Merge caffe2 with pytorch.

Edward Z. Yang
2018-03-30 10:29:50 -07:00
1983 changed files with 369779 additions and 20 deletions

.github/CONTRIBUTING.md

@@ -6,6 +6,7 @@ We like to limit our issues to bug reports and feature requests. If you have a q
If you are submitting a feature request, please preface the title with [feature request].
When submitting a bug report, please include the following information (where relevant):
- PyTorch or Caffe2:
- OS:
- PyTorch version:
- How you installed PyTorch (conda, pip, source):
@@ -13,9 +14,11 @@ When submitting a bug report, please include the following information (where re
- CUDA/cuDNN version:
- GPU models and configuration:
- GCC version (if compiling from source):
- CMake version:
- Build command you used (if compiling from source):
- Versions of any other relevant libraries:
In addition, including the following information will also be very helpful for us to diagnose the problem:
- A script to reproduce the bug. Please try to provide as minimal of a test case as possible.
- Error messages and/or stack traces of the bug
- Context around what you are trying to do

.github/PULL_REQUEST_TEMPLATE.md (new empty file)

.gitignore

@@ -1,3 +1,5 @@
## PyTorch
build/
dist/
torch.egg-info/
@@ -60,3 +62,112 @@ test/data/linear.pt
compile_commands.json
*.egg-info/
docs/source/_static/img/activation/
## General
# Compiled Object files
*.slo
*.lo
*.o
*.cuo
*.obj
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Compiled protocol buffers
*.pb.h
*.pb.cc
*_pb2.py
# Compiled python
*.pyc
*.pyd
# Compiled MATLAB
*.mex*
# IPython notebook checkpoints
.ipynb_checkpoints
# Editor temporaries
*.swn
*.swo
*.swp
*~
# Sublime Text settings
*.sublime-workspace
*.sublime-project
# Eclipse Project settings
*.*project
.settings
# QtCreator files
*.user
# PyCharm files
.idea
# Visual Studio Code files
.vscode
.vs
# OSX dir files
.DS_Store
## Caffe2
# build, distribute, and bins (+ python proto bindings)
build
build_host_protoc
build_android
build_ios
build_*
.build_debug/*
.build_release/*
distribute/*
*.testbin
*.bin
cmake_build
.cmake_build
gen
.setuptools-cmake-build
.pytest_cache
# Bram
plsdontbreak
# Generated documentation
docs/_site
docs/gathered
_site
doxygen
docs/dev
# LevelDB files
*.sst
*.ldb
LOCK
LOG*
CURRENT
MANIFEST-*
# generated version file
caffe2/version.py
# setup.py intermediates
.eggs
caffe2.egg-info
# Atom/Watchman required file
.watchmanconfig

.gitmodules

@@ -1,12 +1,3 @@
[submodule "torch/lib/gloo"]
path = third_party/gloo
url = https://github.com/facebookincubator/gloo
[submodule "torch/lib/pybind11"]
path = third_party/pybind11
url = https://github.com/pybind/pybind11
[submodule "torch/lib/nanopb"]
path = third_party/nanopb
url = https://github.com/nanopb/nanopb.git
[submodule "aten/src/ATen/cpu/cpuinfo"]
path = aten/src/ATen/cpu/cpuinfo
url = https://github.com/Maratyszcza/cpuinfo
@ -17,3 +8,75 @@
[submodule "aten/src/ATen/utils/catch"]
path = aten/src/ATen/utils/catch
url = https://github.com/catchorg/Catch2.git
[submodule "third_party/nanopb"]
path = third_party/nanopb
url = https://github.com/nanopb/nanopb.git
[submodule "third_party/pybind11"]
path = third_party/pybind11
url = https://github.com/pybind/pybind11.git
[submodule "third_party/nccl"]
path = third_party/nccl
url = https://github.com/nvidia/nccl.git
[submodule "third_party/cub"]
path = third_party/cub
url = https://github.com/NVlabs/cub.git
[submodule "third_party/eigen"]
path = third_party/eigen
url = https://github.com/RLovelett/eigen.git
[submodule "third_party/googletest"]
path = third_party/googletest
url = https://github.com/google/googletest.git
[submodule "third_party/nervanagpu"]
path = third_party/nervanagpu
url = https://github.com/NervanaSystems/nervanagpu.git
[submodule "third_party/benchmark"]
path = third_party/benchmark
url = https://github.com/google/benchmark.git
[submodule "third_party/protobuf"]
path = third_party/protobuf
url = https://github.com/google/protobuf.git
[submodule "third_party/ios-cmake"]
path = third_party/ios-cmake
url = https://github.com/Yangqing/ios-cmake.git
[submodule "third_party/NNPACK"]
path = third_party/NNPACK
url = https://github.com/Maratyszcza/NNPACK.git
[submodule "third_party/gloo"]
path = third_party/gloo
url = https://github.com/facebookincubator/gloo
[submodule "third_party/NNPACK_deps/pthreadpool"]
path = third_party/pthreadpool
url = https://github.com/Maratyszcza/pthreadpool.git
[submodule "third_party/NNPACK_deps/FXdiv"]
path = third_party/FXdiv
url = https://github.com/Maratyszcza/FXdiv.git
[submodule "third_party/NNPACK_deps/FP16"]
path = third_party/FP16
url = https://github.com/Maratyszcza/FP16.git
[submodule "third_party/NNPACK_deps/psimd"]
path = third_party/psimd
url = https://github.com/Maratyszcza/psimd.git
[submodule "third_party/aten"]
path = third_party/aten
url = https://github.com/zdevito/aten
[submodule "third_party/zstd"]
path = third_party/zstd
url = https://github.com/facebook/zstd.git
[submodule "third-party/cpuinfo"]
path = third_party/cpuinfo
url = https://github.com/Maratyszcza/cpuinfo.git
[submodule "third_party/python-enum"]
path = third_party/python-enum
url = https://github.com/PeachPy/enum34.git
[submodule "third_party/python-peachpy"]
path = third_party/python-peachpy
url = https://github.com/Maratyszcza/PeachPy.git
[submodule "third_party/python-six"]
path = third_party/python-six
url = https://github.com/benjaminp/six.git
[submodule "third_party/ComputeLibrary"]
path = third_party/ComputeLibrary
url = https://github.com/ARM-software/ComputeLibrary.git
[submodule "third_party/onnx"]
path = third_party/onnx
url = https://github.com/onnx/onnx.git

.jenkins/caffe2/README.md (new file)

@@ -0,0 +1,14 @@
# Jenkins

The scripts in this directory are the entrypoint for testing Caffe2.

The environment variable `BUILD_ENVIRONMENT` is expected to be set to
the build environment you intend to test. It is a hint for the build
and test scripts to configure Caffe2 a certain way and include/exclude
tests. For Docker images, it equals the name of the image itself, for
example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker images that are
built on Jenkins and are used in triggered builds already have this
environment variable set in their manifest. Also see
`./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`.

Our Jenkins installation is located at https://ci.pytorch.org/jenkins/.
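
A minimal sketch of driving these entrypoints by hand (an assumption for
local experimentation, not part of the CI setup; the image name is the
example documented above):

    export BUILD_ENVIRONMENT=py2-cuda9.0-cudnn7-ubuntu16.04
    .jenkins/caffe2/build.sh   # configure and build, mirroring the CI build
    .jenkins/caffe2/test.sh    # run the C++ and Python test suites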

.jenkins/caffe2/build.sh (new executable file)

@@ -0,0 +1,185 @@
#!/bin/bash

set -ex

LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
ROOT_DIR=$(cd "$LOCAL_DIR"/../.. && pwd)

# Setup sccache if SCCACHE_BUCKET is set
if [ -n "${SCCACHE_BUCKET}" ]; then
  mkdir -p ./sccache

  SCCACHE="$(which sccache)"
  if [ -z "${SCCACHE}" ]; then
    echo "Unable to find sccache..."
    exit 1
  fi

  # Setup wrapper scripts
  for compiler in cc c++ gcc g++ x86_64-linux-gnu-gcc; do
    (
      echo "#!/bin/sh"
      echo "exec $SCCACHE $(which $compiler) \"\$@\""
    ) > "./sccache/$compiler"
    chmod +x "./sccache/$compiler"
  done

  # CMake must find these wrapper scripts
  export PATH="$PWD/sccache:$PATH"
fi

# Setup ccache if configured to use it (and not sccache)
if [ -z "${SCCACHE}" ] && which ccache > /dev/null; then
  mkdir -p ./ccache
  ln -sf "$(which ccache)" ./ccache/cc
  ln -sf "$(which ccache)" ./ccache/c++
  ln -sf "$(which ccache)" ./ccache/gcc
  ln -sf "$(which ccache)" ./ccache/g++
  ln -sf "$(which ccache)" ./ccache/x86_64-linux-gnu-gcc
  export CCACHE_WRAPPER_DIR="$PWD/ccache"
  export PATH="$CCACHE_WRAPPER_DIR:$PATH"
fi

CMAKE_ARGS=("-DBUILD_BINARY=ON")
CMAKE_ARGS+=("-DUSE_OBSERVERS=ON")
CMAKE_ARGS+=("-DUSE_ZSTD=ON")

# Run build script from scripts if applicable
if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
  export ANDROID_NDK=/opt/ndk
  "${ROOT_DIR}/scripts/build_android.sh" ${CMAKE_ARGS[*]} "$@"
  exit 0
fi

if [[ "${BUILD_ENVIRONMENT}" == conda* ]]; then
  # click (required by onnx) wants these set
  export LANG=C.UTF-8
  export LC_ALL=C.UTF-8

  # SKIP_CONDA_TESTS refers to only the 'test' section of the meta.yaml
  export SKIP_CONDA_TESTS=1
  export CONDA_INSTALL_LOCALLY=1
  "${ROOT_DIR}/scripts/build_anaconda.sh" "$@"

  # The tests all need hypothesis, tabulate, and pydot, which aren't included
  # in the conda packages
  conda install -y hypothesis tabulate pydot

  # This build will be tested against onnx tests, which needs onnx installed.
  # Onnx should be built against the same protobuf that Caffe2 uses, which is
  # only installed in the conda environment when Caffe2 is.
  # This path comes from install_anaconda.sh which installs Anaconda into the
  # docker image
  PROTOBUF_INCDIR=/opt/conda/include pip install "${ROOT_DIR}/third_party/onnx"

  exit 0
fi

# Run cmake from ./build directory
mkdir -p ./build
cd ./build

INSTALL_PREFIX="/usr/local/caffe2"
CMAKE_ARGS+=("-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}")

# Explicitly set Python executable.
# On Ubuntu 16.04 the default Python is still 2.7.
PYTHON="$(which python)"
if [[ "${BUILD_ENVIRONMENT}" == py3* ]]; then
  PYTHON=/usr/bin/python3
  CMAKE_ARGS+=("-DPYTHON_EXECUTABLE=${PYTHON}")
fi

case "${BUILD_ENVIRONMENT}" in
  *-mkl*)
    CMAKE_ARGS+=("-DBLAS=MKL")
    ;;
  *-cuda*)
    CMAKE_ARGS+=("-DUSE_CUDA=ON")
    CMAKE_ARGS+=("-DCUDA_ARCH_NAME=Maxwell")
    CMAKE_ARGS+=("-DUSE_NNPACK=OFF")

    # Add ccache symlink for nvcc
    ln -sf "$(which ccache)" "${CCACHE_WRAPPER_DIR}/nvcc"

    # Explicitly set path to NVCC such that the symlink to ccache is used
    CMAKE_ARGS+=("-DCUDA_NVCC_EXECUTABLE=${CCACHE_WRAPPER_DIR}/nvcc")

    # Ensure FindCUDA.cmake can infer the right path to the CUDA toolkit.
    # Setting PATH to resolve to the right nvcc alone isn't enough.
    # See /usr/share/cmake-3.5/Modules/FindCUDA.cmake, block at line 589.
    export CUDA_PATH="/usr/local/cuda"

    # Ensure the ccache symlink can still find the real nvcc binary.
    export PATH="/usr/local/cuda/bin:$PATH"
    ;;
esac

# Try to include Redis support for Linux builds
if [ "$(uname)" == "Linux" ]; then
  CMAKE_ARGS+=("-DUSE_REDIS=ON")
fi

# Currently, on Jenkins mac os, we will use custom protobuf. Mac OS
# contbuild at the moment is minimal dependency - it doesn't use glog
# or gflags either.
if [ "$(uname)" == "Darwin" ]; then
  CMAKE_ARGS+=("-DBUILD_CUSTOM_PROTOBUF=ON")
fi

# We test the presence of cmake3 (for platforms like CentOS and Ubuntu 14.04)
# and use that if so.
if [[ -x "$(command -v cmake3)" ]]; then
  CMAKE_BINARY=cmake3
else
  CMAKE_BINARY=cmake
fi

# Configure
${CMAKE_BINARY} "${ROOT_DIR}" ${CMAKE_ARGS[*]} "$@"

# Build
if [ "$(uname)" == "Linux" ]; then
  make "-j$(nproc)" install
else
  echo "Don't know how to build on $(uname)"
  exit 1
fi

# Install ONNX into a local directory
ONNX_INSTALL_PATH="/usr/local/onnx"
pip install "${ROOT_DIR}/third_party/onnx" -t "${ONNX_INSTALL_PATH}"

# Symlink the caffe2 base python path into the system python path,
# so that we can import caffe2 without having to change $PYTHONPATH.
# Run in a subshell to contain environment set by /etc/os-release.
#
# This is only done when running on Jenkins! We don't want to pollute
# the user environment with Python symlinks and ld.so.conf.d hacks.
#
if [ -n "${JENKINS_URL}" ]; then
  (
    source /etc/os-release

    function python_version() {
      "$PYTHON" -c 'import sys; print("python%d.%d" % sys.version_info[0:2])'
    }

    # Debian/Ubuntu
    if [[ "$ID_LIKE" == *debian* ]]; then
      python_path="/usr/local/lib/$(python_version)/dist-packages"
      sudo ln -sf "${INSTALL_PREFIX}/caffe2" "${python_path}"
      sudo ln -sf "${ONNX_INSTALL_PATH}/onnx" "${python_path}"
    fi

    # RHEL/CentOS
    if [[ "$ID_LIKE" == *rhel* ]]; then
      python_path="/usr/lib64/$(python_version)/site-packages/"
      sudo ln -sf "${INSTALL_PREFIX}/caffe2" "${python_path}"
      sudo ln -sf "${ONNX_INSTALL_PATH}/onnx" "${python_path}"
    fi

    # /etc/ld.so.conf.d is used on both Debian and RHEL
    echo "${INSTALL_PREFIX}/lib" | sudo tee /etc/ld.so.conf.d/caffe2.conf
    sudo ldconfig
  )
fi

.jenkins/caffe2/test.sh (new executable file)

@@ -0,0 +1,132 @@
#!/bin/bash

set -ex

LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
ROOT_DIR=$(cd "$LOCAL_DIR"/../.. && pwd)

# Figure out which Python to use
PYTHON="python"
if [ -n "$BUILD_ENVIRONMENT" ]; then
  if [[ "$BUILD_ENVIRONMENT" == py2* ]]; then
    PYTHON="python2"
  elif [[ "$BUILD_ENVIRONMENT" == py3* ]]; then
    PYTHON="python3"
  fi
fi

# The prefix must mirror the setting from build.sh
INSTALL_PREFIX="/usr/local/caffe2"

# Anaconda builds have a special install prefix and python
if [[ "$BUILD_ENVIRONMENT" == conda* ]]; then
  # This path comes from install_anaconda.sh which installs Anaconda into the
  # docker image
  PYTHON="/opt/conda/bin/python"
  INSTALL_PREFIX="/opt/conda/"

  # Testing requires separate packages
  if [[ $BUILD_ENVIRONMENT == *gcc4* ]]; then
    # These are from conda-forge
    conda install -yc conda-forge hypothesis tabulate pydot networkx==2.0 click pytest scipy
    # These packages are from the default channels
    conda install -y opencv=3.1.0=np112py27_1 pil=1.1.7=py27_2
  else
    conda install -y hypothesis tabulate pydot
  fi

  # This build will be tested against onnx tests, which needs onnx installed.
  # Onnx should be built against the same protobuf that Caffe2 uses, which is
  # only installed in the conda environment when Caffe2 is.
  # This path comes from install_anaconda.sh which installs Anaconda into the
  # docker image
  PROTOBUF_INCDIR=/opt/conda/include pip install "${ROOT_DIR}/third_party/onnx"
fi

# Add the site-packages in the caffe2 install prefix to the PYTHONPATH
SITE_DIR=$($PYTHON -c "from distutils import sysconfig; print(sysconfig.get_python_lib(prefix=''))")
INSTALL_SITE_DIR="${INSTALL_PREFIX}/${SITE_DIR}"

# Skip tests in environments where they are not built/applicable
if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
  echo 'Skipping tests'
  exit 0
fi

# Set PYTHONPATH and LD_LIBRARY_PATH so that python can find the installed
# Caffe2. This shouldn't be done on Anaconda, as Anaconda should handle this.
if [[ "$BUILD_ENVIRONMENT" != conda* ]]; then
  export PYTHONPATH="${PYTHONPATH}:$INSTALL_SITE_DIR"
  export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${INSTALL_PREFIX}/lib"
fi

exit_code=0

cd "$ROOT_DIR"

if [ -d ./test ]; then
  echo "Directory ./test already exists; please remove it..."
  exit 1
fi

mkdir -p ./test/{cpp,python}
TEST_DIR="$PWD/test"

cd ${INSTALL_PREFIX}

# Commands below may exit with non-zero status
set +e

# C++ tests
echo "Running C++ tests.."
for test in ./test/*; do
  # Skip tests we know are hanging or bad
  case "$(basename "$test")" in
    mkl_utils_test)
      continue
      ;;
    # TODO investigate conv_op_test failures when using MKL
    conv_op_test)
      continue
      ;;
  esac

  "$test" --gtest_output=xml:"$TEST_DIR"/cpp/$(basename "$test").xml
  tmp_exit_code="$?"
  if [ "$exit_code" -eq 0 ]; then
    exit_code="$tmp_exit_code"
  fi
done

# Get the relative path to where the caffe2 python module was installed
CAFFE2_PYPATH="$INSTALL_SITE_DIR/caffe2"

# Collect additional tests to run (outside caffe2/python)
EXTRA_TESTS=()

# CUDA builds always include NCCL support
if [[ "$BUILD_ENVIRONMENT" == *-cuda* ]]; then
  EXTRA_TESTS+=("$CAFFE2_PYPATH/contrib/nccl")
fi

# Python tests
echo "Running Python tests.."
"$PYTHON" \
  -m pytest \
  -x \
  -v \
  --junit-xml="$TEST_DIR/python/result.xml" \
  --ignore "$CAFFE2_PYPATH/python/test/executor_test.py" \
  --ignore "$CAFFE2_PYPATH/python/operator_test/matmul_op_test.py" \
  --ignore "$CAFFE2_PYPATH/python/operator_test/pack_ops_test.py" \
  --ignore "$CAFFE2_PYPATH/python/mkl/mkl_sbn_speed_test.py" \
  "$CAFFE2_PYPATH/python" \
  "${EXTRA_TESTS[@]}"
tmp_exit_code="$?"
if [ "$exit_code" -eq 0 ]; then
  exit_code="$tmp_exit_code"
fi

# Exit with the first non-zero status we got
exit "$exit_code"

CMakeLists.txt (new file)

@@ -0,0 +1,286 @@
cmake_minimum_required(VERSION 3.2 FATAL_ERROR)
#cmake_policy(SET CMP0022 NEW)
#cmake_policy(SET CMP0023 NEW)

# ---[ Project and semantic versioning.
project(Caffe2 CXX C)

set(CAFFE2_VERSION_MAJOR 0)
set(CAFFE2_VERSION_MINOR 8)
set(CAFFE2_VERSION_PATCH 2)
set(CAFFE2_VERSION
    "${CAFFE2_VERSION_MAJOR}.${CAFFE2_VERSION_MINOR}.${CAFFE2_VERSION_PATCH}")

# One variable that determines whether the current cmake process is being run
# with the main Caffe2 library. This is useful for building modules - if
# modules are built with the main Caffe2 library then one does not need to
# find caffe2 in the cmake script. One can usually guard it in some way like
#   if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO)
#     find_package(Caffe2 REQUIRED)
#   endif()
set(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO ON)

# ---[ Options.
# Note to developers: if you add an option below, make sure you also add it to
# cmake/Summary.cmake so that the summary prints out the option values.
include(CMakeDependentOption)
option(BUILD_BINARY "Build C++ binaries" ON)
option(BUILD_DOCS "Build documentation" OFF)
option(BUILD_CUSTOM_PROTOBUF "Build and use Caffe2's own protobuf under third_party" OFF)
option(BUILD_PYTHON "Build Python binaries" ON)
option(BUILD_SHARED_LIBS "Build libcaffe2.so" ON)
cmake_dependent_option(
    CAFFE2_LINK_LOCAL_PROTOBUF "If set, build protobuf inside libcaffe2.so." ON
    "BUILD_SHARED_LIBS AND BUILD_CUSTOM_PROTOBUF" OFF)
cmake_dependent_option(
    CAFFE2_USE_MSVC_STATIC_RUNTIME "Using MSVC static runtime libraries" ON
    "NOT BUILD_SHARED_LIBS" OFF)
option(BUILD_TEST "Build C++ test binaries (need gtest and gbenchmark)" ON)
option(USE_ACL "Use ARM Compute Library" OFF)
option(USE_ASAN "Use Address Sanitizer" OFF)
option(USE_ATEN "Use ATen" OFF)
option(USE_CUDA "Use Cuda" ON)
option(USE_FFMPEG "Use ffmpeg" OFF)
option(USE_GFLAGS "Use GFLAGS" ON)
option(USE_GLOG "Use GLOG" ON)
option(USE_GLOO "Use Gloo" ON)
option(USE_LEVELDB "Use LEVELDB" ON)
option(USE_LITE_PROTO "Use lite protobuf instead of full." OFF)
option(USE_LMDB "Use LMDB" ON)
option(USE_METAL "Use Metal for iOS build" ON)
option(USE_MOBILE_OPENGL "Use OpenGL for mobile code" ON)
option(USE_MPI "Use MPI" ON)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
option(USE_NCCL "Use NCCL" ON)
option(USE_NERVANA_GPU "Use Nervana GPU backend" OFF)
option(USE_NNAPI "Use NNAPI" OFF)
option(USE_NNPACK "Use NNPACK" ON)
option(USE_NUMA "Use NUMA (only available on Linux)" ON)
option(USE_OBSERVERS "Use observers module." OFF)
option(USE_OPENCV "Use openCV" ON)
option(USE_OPENMP "Use OpenMP for parallel code" OFF)
option(USE_PROF "Use profiling" OFF)
option(USE_REDIS "Use Redis" OFF)
option(USE_ROCKSDB "Use RocksDB" OFF)
option(USE_SNPE "Use Qualcomm's SNPE library" OFF)
option(USE_ZMQ "Use ZMQ" OFF)
option(USE_ZSTD "Use ZSTD" OFF)

# ---[ CMake scripts + modules
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)

if (MSVC AND ${BUILD_SHARED_LIBS})
  set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
endif()

# ---[ CMake build directories
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

enable_testing()

# ---[ Misc checks to cope with various compiler modes
include(cmake/MiscCheck.cmake)
include(cmake/BuildVariables.cmake)

# External projects
include(ExternalProject)

# TODO: merge the following 3 files into cmake/public/utils.cmake.
include(cmake/Utils.cmake)
include(cmake/public/utils.cmake)

set(CAFFE2_WHITELIST "" CACHE STRING "A whitelist file of files that one should build.")

# Set default build type
if(NOT CMAKE_BUILD_TYPE)
  message(STATUS "Build type not set - defaulting to Release")
  set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build from: Debug Release RelWithDebInfo MinSizeRel Coverage." FORCE)
endif()

# ---[ Dependencies
include(cmake/Dependencies.cmake)

# ---[ Whitelist file if whitelist is specified
include(cmake/Whitelist.cmake)

# ---[ Set link flag, handle additional deps for gcc 4.8 and above
if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.8.0 AND NOT ANDROID)
  message(STATUS "GCC ${CMAKE_CXX_COMPILER_VERSION}: Adding gcc and gcc_s libs to link line")
  list(APPEND Caffe2_DEPENDENCY_LIBS gcc_s gcc)
endif()

# ---[ Build flags
set(CMAKE_C_STANDARD 99)
set(CMAKE_CXX_STANDARD 11)
if(NOT MSVC)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -fPIC")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
  # Eigen fails to build with some versions, so convert this to a warning
  # Details at http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1459
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-invalid-partial-specialization")
else()
  foreach(flag_var
      CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
      CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
    if (${CAFFE2_USE_MSVC_STATIC_RUNTIME})
      if(${flag_var} MATCHES "/MD")
        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
      endif(${flag_var} MATCHES "/MD")
    else()
      if(${flag_var} MATCHES "/MT")
        string(REGEX REPLACE "/MT" "/MD" ${flag_var} "${${flag_var}}")
      endif()
    endif()
    set(${flag_var} "${${flag_var}} /MP /bigobj")
  endforeach(flag_var)
endif()

if(ANDROID)
  if(CMAKE_COMPILER_IS_GNUCXX)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -s")
  else()
    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -s")
  endif()
endif()

if(NOT APPLE AND UNIX)
  list(APPEND Caffe2_DEPENDENCY_LIBS dl)
endif()

# Prefix path to Caffe2 headers.
# If a directory containing installed Caffe2 headers was inadvertently
# added to the list of include directories, prefixing
# PROJECT_SOURCE_DIR means this source tree always takes precedence.
include_directories(BEFORE ${PROJECT_SOURCE_DIR})

# Prefix path to generated Caffe2 headers.
# These need to take precedence over their empty counterparts located
# in PROJECT_SOURCE_DIR.
include_directories(BEFORE ${PROJECT_BINARY_DIR})

# ---[ Old caffe protobuf.
add_subdirectory(caffe/proto)

# ---[ Main build
add_subdirectory(caffe2)

# Documentation Option
if(BUILD_DOCS)
  # check if Doxygen is installed
  find_package(Doxygen)
  if (DOXYGEN_FOUND)
    message("Generating documentation")

    set(DOXYGEN_C_IN ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/.Doxyfile-c)
    set(DOXYGEN_C_OUT ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/Doxyfile-c)
    set(DOXYGEN_P_IN ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/.Doxyfile-python)
    set(DOXYGEN_P_OUT ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/Doxyfile-python)

    if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/docs)
      file(REMOVE_RECURSE ${CMAKE_CURRENT_BINARY_DIR}/docs)
    endif (EXISTS ${CMAKE_CURRENT_BINARY_DIR}/docs)

    file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/docs)

    configure_file(${DOXYGEN_C_IN} ${DOXYGEN_C_OUT} @ONLY)
    configure_file(${DOXYGEN_P_IN} ${DOXYGEN_P_OUT} @ONLY)

    add_custom_target(doc_doxygen_c ALL
        COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_C_OUT}
        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
        COMMENT "Generating C++ API documentation with Doxygen"
        VERBATIM)

    add_custom_target(doc_doxygen_python ALL
        COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_P_OUT}
        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
        COMMENT "Generating Python API documentation with Doxygen"
        VERBATIM)
  else (DOXYGEN_FOUND)
    message(FATAL_ERROR "Doxygen needs to be installed to generate the documentation")
  endif (DOXYGEN_FOUND)
endif (BUILD_DOCS)

# ---[ CMake related files
# Uninstall option.
if(NOT TARGET caffe2_uninstall)
  configure_file(
      ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in
      ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake
      IMMEDIATE @ONLY)

  add_custom_target(caffe2_uninstall
      COMMAND ${CMAKE_COMMAND} -P
      ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake)
endif()

# ---[ Make configuration files for cmake to allow dependent libraries
# easier access to Caffe2.
if ((NOT USE_GLOG) OR (NOT USE_GFLAGS) OR BUILD_CUSTOM_PROTOBUF)
  message(WARNING
      "Generated cmake files are only fully tested if one builds "
      "with system glog, gflags, and protobuf. Other settings may "
      "generate files that are not well tested.")
endif()

if (USE_CUDA)
  # TODO: check if we should include other cuda dependency libraries
  # to the interface as well.
endif()

# Note(jiayq): when building static libraries, all PRIVATE dependencies
# will also become interface libraries, and as a result if there are any
# dependency libraries that are not exported, the following install export
# script will fail. As a result, we will only provide the targets cmake
# files for shared lib installation. For more info, read:
# https://cmake.org/pipermail/cmake/2016-May/063400.html
if (BUILD_SHARED_LIBS)
  configure_file(
      ${PROJECT_SOURCE_DIR}/cmake/Caffe2ConfigVersion.cmake.in
      ${PROJECT_BINARY_DIR}/Caffe2ConfigVersion.cmake
      @ONLY)
  configure_file(
      ${PROJECT_SOURCE_DIR}/cmake/Caffe2Config.cmake.in
      ${PROJECT_BINARY_DIR}/Caffe2Config.cmake
      @ONLY)
  install(FILES
      ${PROJECT_BINARY_DIR}/Caffe2ConfigVersion.cmake
      ${PROJECT_BINARY_DIR}/Caffe2Config.cmake
      DESTINATION share/cmake/Caffe2
      COMPONENT dev)
  install(FILES
      ${PROJECT_SOURCE_DIR}/cmake/public/cuda.cmake
      ${PROJECT_SOURCE_DIR}/cmake/public/glog.cmake
      ${PROJECT_SOURCE_DIR}/cmake/public/gflags.cmake
      ${PROJECT_SOURCE_DIR}/cmake/public/protobuf.cmake
      ${PROJECT_SOURCE_DIR}/cmake/public/threads.cmake
      ${PROJECT_SOURCE_DIR}/cmake/public/utils.cmake
      DESTINATION share/cmake/Caffe2/public
      COMPONENT dev)
  install(EXPORT Caffe2Targets DESTINATION share/cmake/Caffe2
      FILE Caffe2Targets.cmake
      COMPONENT dev)
else()
  message(WARNING
      "Generated cmake files are only available when building "
      "shared libs.")
endif()

# ---[ Modules
add_subdirectory(modules)

# ---[ Binaries
# Binaries will be built after the Caffe2 main libraries and the modules
# are built. For the binaries, they will be linked to the Caffe2 main
# libraries, as well as all the modules that are built with Caffe2 (the ones
# built in the previous Modules section above).
if (BUILD_BINARY)
  add_subdirectory(binaries)
endif()

include(cmake/Summary.cmake)
caffe2_print_configuration_summary()
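
A hypothetical out-of-tree configure run that toggles a few of the options
declared above (the option names come from this file; the chosen values are
only illustrative):

    mkdir -p build && cd build
    cmake .. -DUSE_CUDA=OFF -DUSE_OPENCV=OFF -DBUILD_TEST=ON -DCMAKE_BUILD_TYPE=Release
    make -j"$(nproc)" install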

LICENSE

@@ -1,13 +1,3 @@
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
All rights reserved.
Redistribution and use in source and binary forms, with or without

Makefile (new file)

@@ -0,0 +1,21 @@
# This makefile does nothing but delegate the actual building to cmake.

all:
	@mkdir -p build && cd build && cmake .. $(shell python ./scripts/get_python_cmake_flags.py) && $(MAKE)

local:
	@./scripts/build_local.sh

android:
	@./scripts/build_android.sh

ios:
	@./scripts/build_ios.sh

clean: # This will remove ALL build folders.
	@rm -r build*/

linecount:
	@cloc --read-lang-def=caffe.cloc caffe2 || \
		echo "Cloc is not available on the machine. You can install cloc with " && \
		echo "  sudo apt-get install cloc"

NOTICE (new file)

@@ -0,0 +1,351 @@
From PyTorch:
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
From Caffe2:
Copyright (c) 2016-present, Facebook Inc. All rights reserved.
All contributions by Facebook:
Copyright (c) 2016 Facebook Inc.
All contributions by Google:
Copyright (c) 2015 Google Inc.
All rights reserved.
All contributions by Yangqing Jia:
Copyright (c) 2015 Yangqing Jia
All rights reserved.
All contributions from Caffe:
Copyright(c) 2013, 2014, 2015, the respective contributors
All rights reserved.
All other contributions:
Copyright(c) 2015, 2016 the respective contributors
All rights reserved.
Caffe2 uses a copyright model similar to Caffe: each contributor holds
copyright over their contributions to Caffe2. The project versioning records
all such contribution and copyright details. If a contributor wants to further
mark their specific copyright on a particular contribution, they should
indicate their copyright solely in the commit message of the change when it is
committed.
=======================================================================
Software under third_party
=======================================================================
Software libraries under third_party are provided as github submodule
links, and their content is not part of the Caffe2 codebase. Their
licenses can be found under the respective software repositories.
=======================================================================
Earlier BSD License
=======================================================================
Early development of Caffe2 in 2015 and early 2016 is licensed under the
BSD license. The license is attached below:
All contributions by Facebook:
Copyright (c) 2016 Facebook Inc.
All contributions by Google:
Copyright (c) 2015 Google Inc.
All rights reserved.
All contributions by Yangqing Jia:
Copyright (c) 2015 Yangqing Jia
All rights reserved.
All other contributions:
Copyright(c) 2015, 2016 the respective contributors
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
=======================================================================
Caffe's BSD License
=======================================================================
Some parts of the caffe2 code are derived from the original Caffe code, which
was created by Yangqing Jia and is now a BSD-licensed open-source project. The
Caffe license is as follows:
COPYRIGHT
All contributions by the University of California:
Copyright (c) 2014, The Regents of the University of California (Regents)
All rights reserved.
All other contributions:
Copyright (c) 2014, the respective contributors
All rights reserved.
Caffe uses a shared copyright model: each contributor holds copyright over
their contributions to Caffe. The project versioning records all such
contribution and copyright details. If a contributor wants to further mark
their specific copyright on a particular contribution, they should indicate
their copyright solely in the commit message of the change when it is
committed.
LICENSE
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
CONTRIBUTION AGREEMENT
By contributing to the BVLC/caffe repository through pull-request, comment,
or otherwise, the contributor releases their content to the
license and copyright terms herein.
=======================================================================
Caffe2's Apache License
=======================================================================
This repo contains Caffe2 code, which was previously licensed under
Apache License Version 2.0:
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

binaries/CMakeLists.txt (new file)

@@ -0,0 +1,52 @@
caffe2_binary_target("convert_caffe_image_db.cc")
caffe2_binary_target("convert_db.cc")
caffe2_binary_target("make_cifar_db.cc")
caffe2_binary_target("make_mnist_db.cc")
caffe2_binary_target("predictor_verifier.cc")
caffe2_binary_target("print_registered_core_operators.cc")
caffe2_binary_target("run_plan.cc")
caffe2_binary_target("speed_benchmark.cc")
caffe2_binary_target("split_db.cc")
caffe2_binary_target("db_throughput.cc")
if (USE_CUDA)
  caffe2_binary_target("inspect_gpus.cc")
  target_link_libraries(inspect_gpus ${CUDA_LIBRARIES})
  caffe2_binary_target("print_core_object_sizes.cc")

  if (BUILD_TEST)
    # Core overhead benchmark
    caffe2_binary_target("core_overhead_benchmark.cc")
    target_link_libraries(core_overhead_benchmark benchmark ${CUDA_curand_LIBRARY})
  endif()
endif()

if (USE_ZMQ)
  caffe2_binary_target("zmq_feeder.cc")
  target_link_libraries(zmq_feeder ${ZMQ_LIBRARIES})
endif()

if(USE_MPI)
  caffe2_binary_target("run_plan_mpi.cc")
  target_link_libraries(run_plan_mpi ${MPI_CXX_LIBRARIES})
endif()

if (USE_OPENCV AND USE_LEVELDB)
  caffe2_binary_target("convert_encoded_to_raw_leveldb.cc")
  target_link_libraries(
      convert_encoded_to_raw_leveldb
      ${OpenCV_LIBS} ${LevelDB_LIBRARIES} ${Snappy_LIBRARIES})
endif()

if (USE_OPENCV)
  caffe2_binary_target("make_image_db.cc")
  target_link_libraries(make_image_db ${OpenCV_LIBS})
endif()

if (USE_OBSERVERS)
  caffe2_binary_target("caffe2_benchmark.cc")
endif()

# ---[ tutorials
caffe2_binary_target("tutorial_blob.cc")

binaries/caffe2_benchmark.cc (new file)

@@ -0,0 +1,241 @@
#include <fstream>
#include <iterator>
#include <string>

#include "caffe2/core/blob_serialization.h"
#include "caffe2/core/init.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/utils/proto_utils.h"
#include "caffe2/utils/string_utils.h"
#include "observers/observer_config.h"

CAFFE2_DEFINE_string(
    backend,
    "builtin",
    "The backend to use when running the model. The allowed "
    "backend choices are: builtin, default, nnpack, eigen, mkl");
CAFFE2_DEFINE_string(
    init_net,
    "",
    "The given net to initialize any parameters.");
CAFFE2_DEFINE_string(
    input,
    "",
    "Input that is needed for running the network. If "
    "multiple input needed, use comma separated string.");
CAFFE2_DEFINE_string(
    input_dims,
    "",
    "Alternate to input_files, if all inputs are simple "
    "float TensorCPUs, specify the dimension using comma "
    "separated numbers. If multiple input needed, use "
    "semicolon to separate the dimension of different "
    "tensors.");
CAFFE2_DEFINE_string(
    input_file,
    "",
    "Input file that contain the serialized protobuf for "
    "the input blobs. If multiple input needed, use comma "
    "separated string. Must have the same number of items "
    "as input does.");
CAFFE2_DEFINE_string(
    input_type,
    "float",
    "Input type when specifying the input dimension. "
    "The supported types are float, uint8_t.");
CAFFE2_DEFINE_int(iter, 10, "The number of iterations to run.");
CAFFE2_DEFINE_string(net, "", "The given net to benchmark.");
CAFFE2_DEFINE_string(
    output,
    "",
    "Output that should be dumped after the execution "
    "finishes. If multiple outputs are needed, use comma "
    "separated string. If you want to dump everything, pass "
    "'*' as the output value.");
CAFFE2_DEFINE_string(
    output_folder,
    "",
    "The folder that the output should be written to. This "
    "folder must already exist in the file system.");
CAFFE2_DEFINE_bool(
    run_individual,
    false,
    "Whether to benchmark individual operators.");
CAFFE2_DEFINE_bool(
    text_output,
    false,
    "Whether to write out output in text format for regression purpose.");
CAFFE2_DEFINE_int(warmup, 0, "The number of iterations to warm up.");

using std::string;
using std::unique_ptr;
using std::vector;

static void writeTextOutput(
    caffe2::TensorCPU* tensor,
    const string& output_prefix,
    const string& name) {
  string output_name = output_prefix + "/" + name + ".txt";
  caffe2::TensorSerializer<caffe2::CPUContext> ser;
  caffe2::BlobProto blob_proto;
  ser.Serialize(
      *tensor, output_name, blob_proto.mutable_tensor(), 0, tensor->size());
  blob_proto.set_name(output_name);
  blob_proto.set_type("Tensor");
  CAFFE_ENFORCE(blob_proto.has_tensor());
  caffe2::TensorProto tensor_proto = blob_proto.tensor();
  vector<float> data;
  switch (tensor_proto.data_type()) {
    case caffe2::TensorProto::FLOAT: {
      std::copy(
          tensor_proto.float_data().begin(),
          tensor_proto.float_data().end(),
          std::back_inserter(data));
      break;
    }
    case caffe2::TensorProto::INT32: {
      std::copy(
          tensor_proto.int32_data().begin(),
          tensor_proto.int32_data().end(),
          std::back_inserter(data));
      break;
    }
    default:
      CAFFE_THROW("Unimplemented Blob type.");
  }
  std::ofstream output_file(output_name);
  std::ostream_iterator<float> output_iterator(output_file, "\n");
  std::copy(data.begin(), data.end(), output_iterator);
}

int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);
  caffe2::ShowLogInfoToStderr();
  unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());

  // Run initialization network.
  caffe2::NetDef init_net_def;
  CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_init_net, &init_net_def));
  CAFFE_ENFORCE(workspace->RunNetOnce(init_net_def));

  // Load input.
  if (caffe2::FLAGS_input.size()) {
    vector<string> input_names = caffe2::split(',', caffe2::FLAGS_input);
    if (caffe2::FLAGS_input_file.size()) {
      vector<string> input_files = caffe2::split(',', caffe2::FLAGS_input_file);
      CAFFE_ENFORCE_EQ(
          input_names.size(),
          input_files.size(),
          "Input name and file should have the same number.");
      for (int i = 0; i < input_names.size(); ++i) {
        caffe2::BlobProto blob_proto;
        CAFFE_ENFORCE(caffe2::ReadProtoFromFile(input_files[i], &blob_proto));
        workspace->CreateBlob(input_names[i])->Deserialize(blob_proto);
      }
    } else if (caffe2::FLAGS_input_dims.size()) {
      vector<string> input_dims_list =
          caffe2::split(';', caffe2::FLAGS_input_dims);
      CAFFE_ENFORCE_EQ(
          input_names.size(),
          input_dims_list.size(),
          "Input name and dims should have the same number of items.");
      for (int i = 0; i < input_names.size(); ++i) {
        vector<string> input_dims_str = caffe2::split(',', input_dims_list[i]);
        vector<int> input_dims;
        for (const string& s : input_dims_str) {
          input_dims.push_back(caffe2::stoi(s));
        }
        if (!workspace->HasBlob(input_names[i])) {
          workspace->CreateBlob(input_names[i]);
        }
        caffe2::TensorCPU* tensor =
            workspace->GetBlob(input_names[i])->GetMutable<caffe2::TensorCPU>();
        tensor->Resize(input_dims);
        if (caffe2::FLAGS_input_type == "float") {
          tensor->mutable_data<float>();
        } else {
          CAFFE_ENFORCE(
              caffe2::FLAGS_input_type == "uint8_t",
              "Only supported input types are: float, uint8_t");
          tensor->mutable_data<uint8_t>();
        }
      }
    } else {
      CAFFE_THROW(
          "You requested input tensors, but neither input_file nor "
          "input_dims is set.");
    }
  }

  // Run main network.
  caffe2::NetDef net_def;
  CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_net, &net_def));
  if (caffe2::FLAGS_backend != "builtin") {
    std::string engine = caffe2::FLAGS_backend == "nnpack"
        ? "NNPACK"
        : caffe2::FLAGS_backend == "eigen"
            ? "EIGEN"
            : caffe2::FLAGS_backend == "mkl"
                ? "MKLDNN"
                : caffe2::FLAGS_backend == "default" ? "" : "NONE";
    CAFFE_ENFORCE(engine != "NONE", "Backend is not supported");
    for (int i = 0; i < net_def.op_size(); i++) {
      caffe2::OperatorDef* op_def = net_def.mutable_op(i);
      op_def->set_engine(engine);
    }
  }

  caffe2::NetBase* net = workspace->CreateNet(net_def);
  CHECK_NOTNULL(net);

  LOG(INFO) << "Starting benchmark.";
  caffe2::ObserverConfig::initSampleRate(
      1, 1, 1, caffe2::FLAGS_run_individual, caffe2::FLAGS_warmup);
  LOG(INFO) << "Running warmup runs.";
  for (int i = 0; i < caffe2::FLAGS_warmup; ++i) {
    CAFFE_ENFORCE(net->Run(), "Warmup run ", i, " has failed.");
  }

  LOG(INFO) << "Main runs.";
  CAFFE_ENFORCE(
      caffe2::FLAGS_iter >= 0,
      "Number of main runs should be non negative, provided ",
      caffe2::FLAGS_iter,
      ".");
  for (int i = 0; i < caffe2::FLAGS_iter; ++i) {
    caffe2::ObserverConfig::initSampleRate(1, 1, 1, 0, caffe2::FLAGS_warmup);
    CAFFE_ENFORCE(net->Run(), "Main run ", i, " has failed.");
    if (caffe2::FLAGS_run_individual) {
      caffe2::ObserverConfig::initSampleRate(1, 1, 1, 1, caffe2::FLAGS_warmup);
      CAFFE_ENFORCE(net->Run(), "Main run ", i, " with operator has failed.");
    }
  }

  string output_prefix = caffe2::FLAGS_output_folder.size()
      ? caffe2::FLAGS_output_folder + "/"
      : "";
  if (caffe2::FLAGS_output.size()) {
    vector<string> output_names = caffe2::split(',', caffe2::FLAGS_output);
    if (caffe2::FLAGS_output == "*") {
      output_names = workspace->Blobs();
    }
    for (const string& name : output_names) {
      CAFFE_ENFORCE(
          workspace->HasBlob(name),
          "You requested a non-existing blob: ",
          name);
      if (caffe2::FLAGS_text_output) {
        auto blob = workspace->GetBlob(name)->GetMutable<caffe2::TensorCPU>();
        writeTextOutput(blob, output_prefix, name);
      } else {
        string serialized = workspace->GetBlob(name)->Serialize(name);
        string output_filename = output_prefix + name;
        caffe2::WriteStringToFile(serialized, output_filename.c_str());
      }
    }
  }

  return 0;
}
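
For orientation, a hypothetical invocation of this benchmark binary (the
binary, file, and blob names are illustrative; the flags are the ones
defined at the top of the file):

    ./caffe2_benchmark \
        --init_net init_net.pb \
        --net predict_net.pb \
        --input data --input_dims 1,3,224,224 --input_type float \
        --warmup 10 --iter 50 \
        --output '*' --output_folder /tmp/caffe2_out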

binaries/convert_caffe_image_db.cc (new file)

@@ -0,0 +1,90 @@
/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "caffe2/core/db.h"
#include "caffe2/core/init.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe/proto/caffe.pb.h"
#include "caffe2/core/logging.h"

CAFFE2_DEFINE_string(input_db, "", "The input db.");
CAFFE2_DEFINE_string(input_db_type, "", "The input db type.");
CAFFE2_DEFINE_string(output_db, "", "The output db.");
CAFFE2_DEFINE_string(output_db_type, "", "The output db type.");
CAFFE2_DEFINE_int(batch_size, 1000, "The write batch size.");

using caffe2::db::Cursor;
using caffe2::db::DB;
using caffe2::db::Transaction;
using caffe2::TensorProto;
using caffe2::TensorProtos;

int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);

  std::unique_ptr<DB> in_db(caffe2::db::CreateDB(
      caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db, caffe2::db::READ));
  std::unique_ptr<DB> out_db(caffe2::db::CreateDB(
      caffe2::FLAGS_output_db_type, caffe2::FLAGS_output_db, caffe2::db::NEW));
  std::unique_ptr<Cursor> cursor(in_db->NewCursor());
  std::unique_ptr<Transaction> transaction(out_db->NewTransaction());

  int count = 0;
  for (; cursor->Valid(); cursor->Next()) {
    caffe::Datum datum;
    CAFFE_ENFORCE(datum.ParseFromString(cursor->value()));
    TensorProtos protos;
    TensorProto* data = protos.add_protos();
    TensorProto* label = protos.add_protos();
    label->set_data_type(TensorProto::INT32);
    label->add_dims(1);
    label->add_int32_data(datum.label());
    if (datum.encoded()) {
      // This is an encoded image. We will copy over the data directly.
      data->set_data_type(TensorProto::STRING);
      data->add_dims(1);
      data->add_string_data(datum.data());
    } else {
      // Float data is not supported right now.
      CAFFE_ENFORCE_EQ(datum.float_data_size(), 0);
      std::vector<char> buffer_vec(datum.data().size());
      char* buffer = buffer_vec.data();
      // Swap order from CHW to HWC.
      int channels = datum.channels();
      int size = datum.height() * datum.width();
      CAFFE_ENFORCE_EQ(datum.data().size(), channels * size);
      for (int c = 0; c < channels; ++c) {
        char* dst = buffer + c;
        const char* src = datum.data().c_str() + c * size;
        for (int n = 0; n < size; ++n) {
          dst[n * channels] = src[n];
        }
      }
      data->set_data_type(TensorProto::BYTE);
      data->add_dims(datum.height());
      data->add_dims(datum.width());
      data->add_dims(datum.channels());
      data->set_byte_data(buffer, datum.data().size());
    }
    transaction->Put(cursor->key(), protos.SerializeAsString());
    if (++count % caffe2::FLAGS_batch_size == 0) {
      transaction->Commit();
      LOG(INFO) << "Converted " << count << " items so far.";
    }
  }
  LOG(INFO) << "A total of " << count << " items processed.";
  return 0;
}

binaries/convert_db.cc (new file)

@@ -0,0 +1,51 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "caffe2/core/db.h"
#include "caffe2/core/init.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/core/logging.h"
CAFFE2_DEFINE_string(input_db, "", "The input db.");
CAFFE2_DEFINE_string(input_db_type, "", "The input db type.");
CAFFE2_DEFINE_string(output_db, "", "The output db.");
CAFFE2_DEFINE_string(output_db_type, "", "The output db type.");
CAFFE2_DEFINE_int(batch_size, 1000, "The write batch size.");
using caffe2::db::Cursor;
using caffe2::db::DB;
using caffe2::db::Transaction;
int main(int argc, char** argv) {
caffe2::GlobalInit(&argc, &argv);
std::unique_ptr<DB> in_db(caffe2::db::CreateDB(
caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db, caffe2::db::READ));
std::unique_ptr<DB> out_db(caffe2::db::CreateDB(
caffe2::FLAGS_output_db_type, caffe2::FLAGS_output_db, caffe2::db::NEW));
std::unique_ptr<Cursor> cursor(in_db->NewCursor());
std::unique_ptr<Transaction> transaction(out_db->NewTransaction());
int count = 0;
for (; cursor->Valid(); cursor->Next()) {
transaction->Put(cursor->key(), cursor->value());
if (++count % caffe2::FLAGS_batch_size == 0) {
transaction->Commit();
LOG(INFO) << "Converted " << count << " items so far.";
}
}
LOG(INFO) << "A total of " << count << " items processed.";
return 0;
}
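Note: in both converters above, items written after the last full batch are only flushed if the Transaction destructor commits, which depends on the db backend. A defensive variant of the copy loop (a sketch, not the code in this diff) commits the remainder explicitly:

int count = 0;
for (; cursor->Valid(); cursor->Next()) {
  transaction->Put(cursor->key(), cursor->value());
  if (++count % caffe2::FLAGS_batch_size == 0) {
    transaction->Commit();
  }
}
// Flush the final, partially filled batch, if any.
if (count % caffe2::FLAGS_batch_size != 0) {
  transaction->Commit();
}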


@ -0,0 +1,156 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// This script converts a leveldb of encoded (e.g. JPEG) images and labels
// into a leveldb of raw, resized byte images in HWC order.
//
// caffe2::FLAGS_input_db_name is the input leveldb (TensorProtos holding an
// encoded image string and a label, as written by make_image_db), and
// caffe2::FLAGS_output_db_name is the leveldb that will be created.
#include <opencv2/opencv.hpp>
#include <fstream> // NOLINT(readability/streams)
#include <memory>
#include <random>
#include <string>
#include "caffe2/core/init.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/core/logging.h"
#include "leveldb/db.h"
#include "leveldb/write_batch.h"
CAFFE2_DEFINE_string(input_db_name, "", "The input image file name.");
CAFFE2_DEFINE_string(output_db_name, "", "The output training leveldb name.");
CAFFE2_DEFINE_bool(color, true, "If set, load images in color.");
CAFFE2_DEFINE_int(scale, 256,
"If caffe2::FLAGS_raw is set, scale all the images' shorter edge to the given "
"value.");
CAFFE2_DEFINE_bool(warp, false, "If warp is set, warp the images to square.");
namespace caffe2 {
using std::string;
using std::unique_ptr;
void ConvertToRawDataset(
const string& input_db_name, const string& output_db_name) {
// input leveldb
std::unique_ptr<leveldb::DB> input_db;
LOG(INFO) << "Opening input leveldb " << input_db_name;
{
leveldb::Options options;
options.create_if_missing = false;
leveldb::DB* db_temp;
leveldb::Status status = leveldb::DB::Open(
options, input_db_name, &db_temp);
CAFFE_ENFORCE(status.ok(), "Failed to open leveldb ", input_db_name, ".");
input_db.reset(db_temp);
}
// output leveldb
std::unique_ptr<leveldb::DB> output_db;
std::unique_ptr<leveldb::WriteBatch> batch;
LOG(INFO) << "Opening leveldb " << output_db_name;
{
leveldb::Options options;
options.error_if_exists = true;
options.create_if_missing = true;
options.write_buffer_size = 268435456;
leveldb::DB* db_temp;
leveldb::Status status = leveldb::DB::Open(
options, output_db_name, &db_temp);
CAFFE_ENFORCE(
status.ok(),
"Failed to open leveldb ",
output_db_name,
". Is it already existing?");
output_db.reset(db_temp);
}
batch.reset(new leveldb::WriteBatch());
TensorProtos input_protos;
TensorProtos output_protos;
TensorProto* data = output_protos.add_protos();
TensorProto* label = output_protos.add_protos();
data->set_data_type(TensorProto::BYTE);
data->add_dims(0);
data->add_dims(0);
if (caffe2::FLAGS_color) {
data->add_dims(3);
}
string value;
unique_ptr<leveldb::Iterator> iter;
iter.reset(input_db->NewIterator(leveldb::ReadOptions()));
iter->SeekToFirst();
int count = 0;
for (; iter->Valid(); iter->Next()) {
CAFFE_ENFORCE(input_protos.ParseFromString(iter->value().ToString()));
label->CopyFrom(input_protos.protos(1));
const string& encoded_image = input_protos.protos(0).string_data(0);
int encoded_size = encoded_image.size();
cv::Mat img = cv::imdecode(
cv::Mat(1, &encoded_size, CV_8UC1,
const_cast<char*>(encoded_image.data())),
caffe2::FLAGS_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE);
cv::Mat resized_img;
int scaled_width, scaled_height;
if (caffe2::FLAGS_warp) {
scaled_width = caffe2::FLAGS_scale;
scaled_height = caffe2::FLAGS_scale;
} else if (img.rows > img.cols) {
scaled_width = caffe2::FLAGS_scale;
scaled_height = static_cast<float>(img.rows) * caffe2::FLAGS_scale / img.cols;
} else {
scaled_height = caffe2::FLAGS_scale;
scaled_width = static_cast<float>(img.cols) * caffe2::FLAGS_scale / img.rows;
}
cv::resize(img, resized_img, cv::Size(scaled_width, scaled_height), 0, 0,
cv::INTER_LINEAR);
data->set_dims(0, scaled_height);
data->set_dims(1, scaled_width);
DCHECK(resized_img.isContinuous());
data->set_byte_data(resized_img.ptr(),
scaled_height * scaled_width * (caffe2::FLAGS_color ? 3 : 1));
output_protos.SerializeToString(&value);
// Put in db
batch->Put(iter->key(), value);
if (++count % 1000 == 0) {
output_db->Write(leveldb::WriteOptions(), batch.get());
batch.reset(new leveldb::WriteBatch());
LOG(INFO) << "Processed " << count << " files.";
}
}
// write the last batch
if (count % 1000 != 0) {
output_db->Write(leveldb::WriteOptions(), batch.get());
}
LOG(INFO) << "Processed a total of " << count << " files.";
}
} // namespace caffe2
int main(int argc, char** argv) {
caffe2::GlobalInit(&argc, &argv);
caffe2::ConvertToRawDataset(
caffe2::FLAGS_input_db_name, caffe2::FLAGS_output_db_name);
return 0;
}
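Note: unless --warp is set, the resize above preserves the aspect ratio by scaling the shorter edge to --scale. The same arithmetic as a standalone sketch, with a worked example (hypothetical 480x640 input, scale 256):

#include <utility>

// Returns {height, width} after scaling the shorter edge to `scale`.
std::pair<int, int> ScaledDims(int rows, int cols, int scale, bool warp) {
  if (warp) {
    return {scale, scale};
  }
  if (rows > cols) {
    // The width is the shorter edge.
    int h = static_cast<float>(rows) * scale / cols;
    return {h, scale};
  }
  // The height is the shorter edge (or the image is square).
  int w = static_cast<float>(cols) * scale / rows;
  return {scale, w};
}

// ScaledDims(480, 640, 256, false) == {256, 341}: the 480-pixel edge becomes
// 256 and the 640-pixel edge becomes 640 * 256 / 480 = 341 (truncated).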


@ -0,0 +1,223 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "benchmark/benchmark.h"
#include "caffe2/core/context.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/operator.h"
#define CAFFE2_SKIP_IF_NO_GPU \
if (!caffe2::NumCudaDevices()) { \
state.SkipWithError("No CUDA available, skipping benchmark."); \
return; \
}
using namespace caffe2;
static void BM_CUDAContextCreation(benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
volatile CUDAContext context_so_we_do_initialization_work;
while (state.KeepRunning()) {
volatile CUDAContext context;
}
}
BENCHMARK(BM_CUDAContextCreation);
static void BM_CUDAContextStreamAccess(benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
CUDAContext context;
while (state.KeepRunning()) {
volatile cudaStream_t stream = context.cuda_stream();
}
}
BENCHMARK(BM_CUDAContextStreamAccess);
static void BM_cudaGetDevice(benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
int id;
while (state.KeepRunning()) {
CUDA_ENFORCE(cudaGetDevice(&id));
}
}
BENCHMARK(BM_cudaGetDevice);
static void BM_cudaSetDevice(benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
int total = NumCudaDevices();
int i = 0;
while (state.KeepRunning()) {
CUDA_ENFORCE(cudaSetDevice((i++) % total));
}
}
BENCHMARK(BM_cudaSetDevice);
static void BM_cudaSetAndGetDevice(benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
int total = NumCudaDevices();
int i = 0;
int id;
while (state.KeepRunning()) {
CUDA_ENFORCE(cudaSetDevice((i++) % total));
CUDA_ENFORCE(cudaGetDevice(&id));
}
}
BENCHMARK(BM_cudaSetAndGetDevice);
static void BM_cudaSetSameDevice(benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
while (state.KeepRunning()) {
CUDA_ENFORCE(cudaSetDevice(0));
}
}
BENCHMARK(BM_cudaSetSameDevice);
static void BM_cudaStreamCreateSyncDelete(benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
cudaStream_t stream;
while (state.KeepRunning()) {
CUDA_ENFORCE(cudaStreamCreate(&stream));
CUDA_ENFORCE(cudaStreamSynchronize(stream));
CUDA_ENFORCE(cudaStreamDestroy(stream));
}
}
BENCHMARK(BM_cudaStreamCreateSyncDelete);
static void BM_cudaStreamSynchronize(benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
cudaStream_t stream;
CUDA_ENFORCE(cudaStreamCreate(&stream));
while (state.KeepRunning()) {
CUDA_ENFORCE(cudaStreamSynchronize(stream));
}
}
BENCHMARK(BM_cudaStreamSynchronize);
static void BM_cudaEventRecord(benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
cudaStream_t stream;
cudaEvent_t event;
CUDA_ENFORCE(cudaStreamCreate(&stream));
CUDA_ENFORCE(cudaEventCreateWithFlags(
&event, cudaEventDefault | cudaEventDisableTiming));
while (state.KeepRunning()) {
CUDA_ENFORCE(cudaEventRecord(event, stream));
}
}
BENCHMARK(BM_cudaEventRecord);
static void BM_cudaStreamWaitEventThenStreamSynchronize(
benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
cudaStream_t stream;
cudaEvent_t event;
CUDA_ENFORCE(cudaStreamCreate(&stream));
CUDA_ENFORCE(cudaEventCreateWithFlags(
&event, cudaEventDefault | cudaEventDisableTiming));
CUDA_ENFORCE(cudaEventRecord(event, stream));
CUDA_ENFORCE(cudaStreamWaitEvent(stream, event, 0));
CUDA_ENFORCE(cudaStreamSynchronize(stream));
while (state.KeepRunning()) {
CUDA_ENFORCE(cudaStreamWaitEvent(stream, event, 0));
CUDA_ENFORCE(cudaStreamSynchronize(stream));
}
}
BENCHMARK(BM_cudaStreamWaitEventThenStreamSynchronize);
static void BM_CudaPointerAffinity(benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
TensorCUDA tensor(vector<TIndex>{1, 2, 3, 4});
float* ptr = tensor.mutable_data<float>();
while (state.KeepRunning()) {
volatile int id = GetGPUIDForPointer(ptr);
}
}
BENCHMARK(BM_CudaPointerAffinity);
namespace {
template <class Context>
class DummyEmptyOp : public Operator<Context> {
public:
DummyEmptyOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {}
bool RunOnDevice() final { return true; }
};
REGISTER_CPU_OPERATOR(DummyEmpty, DummyEmptyOp<CPUContext>);
REGISTER_CUDA_OPERATOR(DummyEmpty, DummyEmptyOp<CUDAContext>);
OPERATOR_SCHEMA(DummyEmpty);
} // namespace
static void BM_OperatorCreationCPU(benchmark::State& state) {
std::unique_ptr<OperatorBase> op;
OperatorDef def;
Workspace ws;
def.set_type("DummyEmpty");
def.mutable_device_option()->set_device_type(CPU);
while (state.KeepRunning()) {
op = CreateOperator(def, &ws);
}
}
BENCHMARK(BM_OperatorCreationCPU);
static void BM_OperatorCreationCUDA(benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
std::unique_ptr<OperatorBase> op;
OperatorDef def;
Workspace ws;
def.set_type("DummyEmpty");
def.mutable_device_option()->set_device_type(CUDA);
while (state.KeepRunning()) {
op = CreateOperator(def, &ws);
}
}
BENCHMARK(BM_OperatorCreationCUDA);
static void BM_RawAllocDeallocCPU(benchmark::State& state) {
while (state.KeepRunning()) {
// Allocating only 1 byte in order to measure the overhead.
auto ptr_and_deleter = GetCPUAllocator()->New(1);
// Deallocate.
ptr_and_deleter.second(ptr_and_deleter.first);
}
}
BENCHMARK(BM_RawAllocDeallocCPU);
static void BM_TensorAllocDeallocCPU(benchmark::State& state) {
Tensor<CPUContext> tensor;
// small allocation
tensor.Resize(32, 32);
while (state.KeepRunning()) {
CHECK(tensor.mutable_data<float>());
tensor.FreeMemory();
}
}
BENCHMARK(BM_TensorAllocDeallocCPU);
static void BM_TensorAllocDeallocCUDA(benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
Tensor<CUDAContext> tensor;
// small allocation
tensor.Resize(32, 32);
while (state.KeepRunning()) {
CHECK(tensor.mutable_data<float>());
tensor.FreeMemory();
}
}
BENCHMARK(BM_TensorAllocDeallocCUDA);
BENCHMARK_MAIN()
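Note: in the google/benchmark idiom used above, each BM_* body loops on state.KeepRunning() until the library has collected enough samples, and BENCHMARK(...) registers the function with the harness that BENCHMARK_MAIN() drives. A minimal illustrative benchmark in the same style (not part of this diff):

static void BM_FillBuffer(benchmark::State& state) {
  char buf[64];
  while (state.KeepRunning()) {
    for (size_t i = 0; i < sizeof(buf); ++i) {
      buf[i] = static_cast<char>(i);
    }
    benchmark::DoNotOptimize(buf[0]); // keep the writes from being elided
  }
}
BENCHMARK(BM_FillBuffer);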

98
binaries/db_throughput.cc Normal file

@ -0,0 +1,98 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cstdio>
#include <thread>
#include <vector>
#include "caffe2/core/db.h"
#include "caffe2/core/init.h"
#include "caffe2/core/timer.h"
#include "caffe2/core/logging.h"
CAFFE2_DEFINE_string(input_db, "", "The input db.");
CAFFE2_DEFINE_string(input_db_type, "", "The input db type.");
CAFFE2_DEFINE_int(report_interval, 1000, "The report interval.");
CAFFE2_DEFINE_int(repeat, 10, "The number to repeat the throughput test.");
CAFFE2_DEFINE_bool(use_reader, false, "If true, use the reader interface.");
CAFFE2_DEFINE_int(num_read_threads, 1,
"The number of concurrent reading threads.");
using caffe2::db::Cursor;
using caffe2::db::DB;
using caffe2::db::DBReader;
using caffe2::string;
void TestThroughputWithDB() {
std::unique_ptr<DB> in_db(caffe2::db::CreateDB(
caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db, caffe2::db::READ));
std::unique_ptr<Cursor> cursor(in_db->NewCursor());
for (int iter_id = 0; iter_id < caffe2::FLAGS_repeat; ++iter_id) {
caffe2::Timer timer;
for (int i = 0; i < caffe2::FLAGS_report_interval; ++i) {
string key = cursor->key();
string value = cursor->value();
//VLOG(1) << "Key " << key;
cursor->Next();
if (!cursor->Valid()) {
cursor->SeekToFirst();
}
}
double elapsed_seconds = timer.Seconds();
printf("Iteration %03d, took %4.5f seconds, throughput %f items/sec.\n",
iter_id, elapsed_seconds,
caffe2::FLAGS_report_interval / elapsed_seconds);
}
}
void TestThroughputWithReaderWorker(const DBReader* reader, int thread_id) {
string key, value;
for (int iter_id = 0; iter_id < caffe2::FLAGS_repeat; ++iter_id) {
caffe2::Timer timer;
for (int i = 0; i < caffe2::FLAGS_report_interval; ++i) {
reader->Read(&key, &value);
}
double elapsed_seconds = timer.Seconds();
printf("Thread %03d iteration %03d, took %4.5f seconds, "
"throughput %f items/sec.\n",
thread_id, iter_id, elapsed_seconds,
caffe2::FLAGS_report_interval / elapsed_seconds);
}
}
void TestThroughputWithReader() {
caffe2::db::DBReader reader(
caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db);
std::vector<std::unique_ptr<std::thread>> reading_threads(
caffe2::FLAGS_num_read_threads);
for (int i = 0; i < reading_threads.size(); ++i) {
reading_threads[i].reset(new std::thread(
TestThroughputWithReaderWorker, &reader, i));
}
for (int i = 0; i < reading_threads.size(); ++i) {
reading_threads[i]->join();
}
}
int main(int argc, char** argv) {
caffe2::GlobalInit(&argc, &argv);
if (caffe2::FLAGS_use_reader) {
TestThroughputWithReader();
} else {
TestThroughputWithDB();
}
return 0;
}

57
binaries/inspect_gpus.cc Normal file

@ -0,0 +1,57 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cuda_runtime.h>
#include <sstream>
#include <vector>
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/init.h"
#include "caffe2/core/logging.h"
using std::vector;
CAFFE2_DECLARE_int(caffe2_log_level);
int main(int argc, char** argv) {
caffe2::GlobalInit(&argc, &argv);
caffe2::SetUsageMessage(
"Inspects the GPUs on the current machine and prints out their details "
"provided by cuda.");
int gpu_count;
CUDA_ENFORCE(cudaGetDeviceCount(&gpu_count));
for (int i = 0; i < gpu_count; ++i) {
LOG(INFO) << "Querying device ID = " << i;
caffe2::DeviceQuery(i);
}
vector<vector<bool> > access_pattern;
CAFFE_ENFORCE(caffe2::GetCudaPeerAccessPattern(&access_pattern));
std::stringstream sstream;
// Find topology
for (int i = 0; i < gpu_count; ++i) {
for (int j = 0; j < gpu_count; ++j) {
sstream << (access_pattern[i][j] ? "+" : "-") << " ";
}
sstream << std::endl;
}
LOG(INFO) << "Access pattern: " << std::endl << sstream.str();
return 0;
}
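Note: for a hypothetical machine with two GPUs that have peer access enabled in both directions, the logged access pattern would render as:

+ +
+ +

A '-' at row i, column j would mean device i cannot directly access memory allocated on device j.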

148
binaries/make_cifar_db.cc Normal file

@ -0,0 +1,148 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
//
// This script converts the CIFAR dataset to the leveldb format used
// by Caffe2 to perform classification.
// Usage:
//    make_cifar_db --input_folder <folder> --output_train_db_name <name>
//                  --output_test_db_name <name> [--is_cifar100]
// The CIFAR dataset can be downloaded at
//    http://www.cs.toronto.edu/~kriz/cifar.html
#include <array>
#include <fstream> // NOLINT(readability/streams)
#include <sstream>
#include <string>
#include "caffe2/core/common.h"
#include "caffe2/core/db.h"
#include "caffe2/core/init.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/core/logging.h"
CAFFE2_DEFINE_string(input_folder, "", "The input folder name.");
CAFFE2_DEFINE_string(output_train_db_name,
"", "The output training db name.");
CAFFE2_DEFINE_string(output_test_db_name,
"", "The output testing db name.");
CAFFE2_DEFINE_string(db, "leveldb", "The db type.");
CAFFE2_DEFINE_bool(is_cifar100, false,
"If set, convert cifar100. Otherwise do cifar10.");
namespace caffe2 {
using std::stringstream;
const int kCIFARSize = 32;
const int kCIFARImageNBytes = kCIFARSize * kCIFARSize * 3;
const int kCIFAR10BatchSize = 10000;
const int kCIFAR10TestDataSize = 10000;
const int kCIFAR10TrainBatches = 5;
const int kCIFAR100TrainDataSize = 50000;
const int kCIFAR100TestDataSize = 10000;
void ReadImage(std::ifstream* file, int* label, char* buffer) {
char label_char;
if (caffe2::FLAGS_is_cifar100) {
// Skip the coarse label.
file->read(&label_char, 1);
}
file->read(&label_char, 1);
*label = label_char;
// Yes, there are better ways to do it, like in-place swap... but I am too
// lazy so let's just write it in a memory-wasteful way.
std::array<char, kCIFARImageNBytes> channel_first_storage;
file->read(channel_first_storage.data(), kCIFARImageNBytes);
for (int c = 0; c < 3; ++c) {
for (int i = 0; i < kCIFARSize * kCIFARSize; ++i) {
buffer[i * 3 + c] =
channel_first_storage[c * kCIFARSize * kCIFARSize + i];
}
}
return;
}
void WriteToDB(const string& filename, const int num_items,
const int& offset, db::DB* db) {
TensorProtos protos;
TensorProto* data = protos.add_protos();
TensorProto* label = protos.add_protos();
data->set_data_type(TensorProto::BYTE);
data->add_dims(kCIFARSize);
data->add_dims(kCIFARSize);
data->add_dims(3);
label->set_data_type(TensorProto::INT32);
label->add_dims(1);
label->add_int32_data(0);
LOG(INFO) << "Converting file " << filename;
std::ifstream data_file(filename.c_str(),
std::ios::in | std::ios::binary);
CAFFE_ENFORCE(data_file, "Unable to open file ", filename);
char str_buffer[kCIFARImageNBytes];
int label_value;
string serialized_protos;
std::unique_ptr<db::Transaction> transaction(db->NewTransaction());
for (int itemid = 0; itemid < num_items; ++itemid) {
ReadImage(&data_file, &label_value, str_buffer);
data->set_byte_data(str_buffer, kCIFARImageNBytes);
label->set_int32_data(0, label_value);
protos.SerializeToString(&serialized_protos);
snprintf(str_buffer, kCIFARImageNBytes, "%05d",
offset + itemid);
transaction->Put(string(str_buffer), serialized_protos);
}
}
void ConvertCIFAR() {
std::unique_ptr<db::DB> train_db(
db::CreateDB(caffe2::FLAGS_db, caffe2::FLAGS_output_train_db_name,
db::NEW));
std::unique_ptr<db::DB> test_db(
db::CreateDB(caffe2::FLAGS_db, caffe2::FLAGS_output_test_db_name,
db::NEW));
if (!caffe2::FLAGS_is_cifar100) {
// This is cifar 10.
for (int fileid = 0; fileid < kCIFAR10TrainBatches; ++fileid) {
stringstream train_file;
train_file << caffe2::FLAGS_input_folder << "/data_batch_" << fileid + 1
<< ".bin";
WriteToDB(train_file.str(), kCIFAR10BatchSize,
fileid * kCIFAR10BatchSize, train_db.get());
}
stringstream test_file;
test_file << caffe2::FLAGS_input_folder << "/test_batch.bin";
WriteToDB(test_file.str(), kCIFAR10TestDataSize, 0, test_db.get());
} else {
// This is cifar 100.
stringstream train_file;
train_file << caffe2::FLAGS_input_folder << "/train.bin";
WriteToDB(train_file.str(), kCIFAR100TrainDataSize, 0, train_db.get());
stringstream test_file;
test_file << caffe2::FLAGS_input_folder << "/test.bin";
WriteToDB(test_file.str(), kCIFAR100TestDataSize, 0, test_db.get());
}
}
} // namespace caffe2
int main(int argc, char** argv) {
caffe2::GlobalInit(&argc, &argv);
caffe2::ConvertCIFAR();
return 0;
}
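Note: keys in the output dbs are zero-padded global indices, so the item order is preserved across batch files. For example (hypothetical values), item 42 of training batch file 3 has offset 2 * 10000, giving the key produced by:

char key[6];
snprintf(key, sizeof(key), "%05d", 2 * 10000 + 42); // yields "20042"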

280
binaries/make_image_db.cc Normal file

@ -0,0 +1,280 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// This script converts an image dataset to a database.
//
// caffe2::FLAGS_input_folder is the root folder that holds all the images
//
// caffe2::FLAGS_list_file is the path to a file containing a list of files
// and their labels, as follows:
//
// subfolder1/file1.JPEG 7
// subfolder1/file2.JPEG 7
// subfolder2/file1.JPEG 8
// ...
//
#include <opencv2/opencv.hpp>
#include <algorithm>
#include <fstream>
#include <queue>
#include <random>
#include <string>
#include <thread>
#include "caffe2/core/common.h"
#include "caffe2/core/db.h"
#include "caffe2/core/init.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/core/logging.h"
CAFFE2_DEFINE_bool(shuffle, false,
"Randomly shuffle the order of images and their labels");
CAFFE2_DEFINE_string(input_folder, "", "The input image file name.");
CAFFE2_DEFINE_string(
list_file,
"",
"The text file containing the list of images.");
CAFFE2_DEFINE_string(output_db_name, "", "The output training leveldb name.");
CAFFE2_DEFINE_string(db, "leveldb", "The db type.");
CAFFE2_DEFINE_bool(raw, false,
"If set, we pre-read the images and store the raw buffer.");
CAFFE2_DEFINE_bool(color, true, "If set, load images in color.");
CAFFE2_DEFINE_int(
scale,
256,
"If caffe2::FLAGS_raw is set, scale the shorter edge to the given value.");
CAFFE2_DEFINE_bool(warp, false, "If warp is set, warp the images to square.");
CAFFE2_DEFINE_int(
num_threads,
-1,
"Number of image parsing and conversion threads.");
namespace caffe2 {
class Converter {
public:
explicit Converter() {
data_ = protos_.add_protos();
label_ = protos_.add_protos();
if (caffe2::FLAGS_raw) {
data_->set_data_type(TensorProto::BYTE);
data_->add_dims(0);
data_->add_dims(0);
if (caffe2::FLAGS_color) {
data_->add_dims(3);
}
} else {
data_->set_data_type(TensorProto::STRING);
data_->add_dims(1);
data_->add_string_data("");
}
label_->set_data_type(TensorProto::INT32);
label_->add_dims(1);
label_->add_int32_data(0);
}
~Converter() {
if (thread_.joinable()) {
thread_.join();
}
}
void queue(const std::pair<std::string, int>& pair) {
in_.push(pair);
}
void start() {
thread_ = std::thread(&Converter::run, this);
}
std::string get() {
std::unique_lock<std::mutex> lock(mutex_);
while (out_.empty()) {
cv_.wait(lock);
}
auto value = out_.front();
out_.pop();
cv_.notify_one();
return value;
}
void run() {
const auto& input_folder = caffe2::FLAGS_input_folder;
std::unique_lock<std::mutex> lock(mutex_);
std::string value;
while (!in_.empty()) {
auto pair = in_.front();
in_.pop();
lock.unlock();
label_->set_int32_data(0, pair.second);
// If --raw is not set, store the encoded file bytes verbatim.
if (!caffe2::FLAGS_raw) {
std::ifstream image_file_stream(input_folder + pair.first);
if (!image_file_stream) {
LOG(ERROR) << "Cannot open " << input_folder << pair.first
<< ". Skipping.";
} else {
data_->mutable_string_data(0)->assign(
std::istreambuf_iterator<char>(image_file_stream),
std::istreambuf_iterator<char>());
}
} else {
// Load image
cv::Mat img = cv::imread(
input_folder + pair.first,
caffe2::FLAGS_color ? CV_LOAD_IMAGE_COLOR
: CV_LOAD_IMAGE_GRAYSCALE);
// Resize image
cv::Mat resized_img;
int scaled_width, scaled_height;
if (caffe2::FLAGS_warp) {
scaled_width = caffe2::FLAGS_scale;
scaled_height = caffe2::FLAGS_scale;
} else if (img.rows > img.cols) {
scaled_width = caffe2::FLAGS_scale;
scaled_height =
static_cast<float>(img.rows) * caffe2::FLAGS_scale / img.cols;
} else {
scaled_height = caffe2::FLAGS_scale;
scaled_width =
static_cast<float>(img.cols) * caffe2::FLAGS_scale / img.rows;
}
cv::resize(
img,
resized_img,
cv::Size(scaled_width, scaled_height),
0,
0,
cv::INTER_LINEAR);
data_->set_dims(0, scaled_height);
data_->set_dims(1, scaled_width);
// Assert we don't have to deal with alignment
DCHECK(resized_img.isContinuous());
auto nbytes = resized_img.total() * resized_img.elemSize();
data_->set_byte_data(resized_img.ptr(), nbytes);
}
protos_.SerializeToString(&value);
// Add serialized proto to out queue or wait if it is not empty
lock.lock();
while (!out_.empty()) {
cv_.wait(lock);
}
out_.push(value);
cv_.notify_one();
}
}
protected:
TensorProtos protos_;
TensorProto* data_;
TensorProto* label_;
std::queue<std::pair<std::string, int>> in_;
std::queue<std::string> out_;
std::mutex mutex_;
std::condition_variable cv_;
std::thread thread_;
};
void ConvertImageDataset(
const string& input_folder,
const string& list_filename,
const string& output_db_name,
const bool /*shuffle*/) {
std::ifstream list_file(list_filename);
std::vector<std::pair<std::string, int> > lines;
std::string filename;
int file_label;
while (list_file >> filename >> file_label) {
lines.push_back(std::make_pair(filename, file_label));
}
if (caffe2::FLAGS_shuffle) {
LOG(INFO) << "Shuffling data";
std::shuffle(lines.begin(), lines.end(), std::default_random_engine(1701));
}
auto num_threads = caffe2::FLAGS_num_threads;
if (num_threads < 1) {
num_threads = std::thread::hardware_concurrency();
}
LOG(INFO) << "Processing " << lines.size() << " images...";
LOG(INFO) << "Opening DB " << output_db_name;
auto db = db::CreateDB(caffe2::FLAGS_db, output_db_name, db::NEW);
auto transaction = db->NewTransaction();
LOG(INFO) << "Using " << num_threads << " processing threads...";
std::vector<Converter> converters(num_threads);
// Queue entries across converters
for (auto i = 0; i < lines.size(); i++) {
converters[i % converters.size()].queue(lines[i]);
}
// Start all converters
for (auto& converter : converters) {
converter.start();
}
constexpr auto key_max_length = 256;
char key_cstr[key_max_length];
string value;
int count = 0;
for (auto i = 0; i < lines.size(); i++) {
// Get serialized proto for this entry
auto value = converters[i % converters.size()].get();
// Synthesize key for this entry
auto key_len = snprintf(
key_cstr, sizeof(key_cstr), "%08d_%s", i, lines[i].first.c_str());
DCHECK_LE(key_len, sizeof(key_cstr));
// Put in db
transaction->Put(string(key_cstr), value);
if (++count % 1000 == 0) {
// Commit the current writes.
transaction->Commit();
LOG(INFO) << "Processed " << count << " files.";
}
}
// Commit final transaction
transaction->Commit();
LOG(INFO) << "Processed " << count << " files.";
}
} // namespace caffe2
int main(int argc, char** argv) {
caffe2::GlobalInit(&argc, &argv);
caffe2::ConvertImageDataset(
caffe2::FLAGS_input_folder, caffe2::FLAGS_list_file,
caffe2::FLAGS_output_db_name, caffe2::FLAGS_shuffle);
return 0;
}
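Note: in the Converter pipeline above, entries are dealt out round-robin (i % num_threads) and collected back in the same order, so the output db preserves the input order even though decoding runs in parallel. The out_ queue acts as a single-slot mailbox guarded by one mutex and condition variable. That handoff pattern, as a stripped-down sketch (illustrative only, not part of this diff):

#include <condition_variable>
#include <mutex>
#include <queue>
#include <string>

class Mailbox {
 public:
  void put(std::string value) {
    std::unique_lock<std::mutex> lock(mutex_);
    // Wait until the consumer has drained the previous item.
    cv_.wait(lock, [this] { return slot_.empty(); });
    slot_.push(std::move(value));
    cv_.notify_one();
  }
  std::string get() {
    std::unique_lock<std::mutex> lock(mutex_);
    cv_.wait(lock, [this] { return !slot_.empty(); });
    std::string value = std::move(slot_.front());
    slot_.pop();
    cv_.notify_one();
    return value;
  }

 private:
  std::queue<std::string> slot_; // holds at most one element
  std::mutex mutex_;
  std::condition_variable cv_;
};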

139
binaries/make_mnist_db.cc Normal file

@ -0,0 +1,139 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// This script converts the MNIST dataset to leveldb.
// The MNIST dataset could be downloaded at
// http://yann.lecun.com/exdb/mnist/
#include <fstream> // NOLINT(readability/streams)
#include <string>
#include "caffe2/core/common.h"
#include "caffe2/core/db.h"
#include "caffe2/core/init.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/core/logging.h"
CAFFE2_DEFINE_string(image_file, "", "The input image file name.");
CAFFE2_DEFINE_string(label_file, "", "The label file name.");
CAFFE2_DEFINE_string(output_file, "", "The output db name.");
CAFFE2_DEFINE_string(db, "leveldb", "The db type.");
CAFFE2_DEFINE_int(data_limit, -1,
"If set, only output this number of data points.");
CAFFE2_DEFINE_bool(channel_first, false,
"If set, write the data as channel-first (CHW order) as the old "
"Caffe does.");
namespace caffe2 {
uint32_t swap_endian(uint32_t val) {
val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF);
return (val << 16) | (val >> 16);
}
void convert_dataset(const char* image_filename, const char* label_filename,
const char* db_path, const int data_limit) {
// Open files
std::ifstream image_file(image_filename, std::ios::in | std::ios::binary);
std::ifstream label_file(label_filename, std::ios::in | std::ios::binary);
CAFFE_ENFORCE(image_file, "Unable to open file ", image_filename);
CAFFE_ENFORCE(label_file, "Unable to open file ", label_filename);
// Read the magic and the meta data
uint32_t magic;
uint32_t num_items;
uint32_t num_labels;
uint32_t rows;
uint32_t cols;
image_file.read(reinterpret_cast<char*>(&magic), 4);
magic = swap_endian(magic);
if (magic == 529205256) {
LOG(FATAL) <<
"It seems that you forgot to unzip the mnist dataset. You should "
"first unzip them using e.g. gunzip on Linux.";
}
CAFFE_ENFORCE_EQ(magic, 2051, "Incorrect image file magic.");
label_file.read(reinterpret_cast<char*>(&magic), 4);
magic = swap_endian(magic);
CAFFE_ENFORCE_EQ(magic, 2049, "Incorrect label file magic.");
image_file.read(reinterpret_cast<char*>(&num_items), 4);
num_items = swap_endian(num_items);
label_file.read(reinterpret_cast<char*>(&num_labels), 4);
num_labels = swap_endian(num_labels);
CAFFE_ENFORCE_EQ(num_items, num_labels);
image_file.read(reinterpret_cast<char*>(&rows), 4);
rows = swap_endian(rows);
image_file.read(reinterpret_cast<char*>(&cols), 4);
cols = swap_endian(cols);
// leveldb
std::unique_ptr<db::DB> mnist_db(db::CreateDB(caffe2::FLAGS_db, db_path, db::NEW));
std::unique_ptr<db::Transaction> transaction(mnist_db->NewTransaction());
// Storing to db
char label_value;
std::vector<char> pixels(rows * cols);
int count = 0;
const int kMaxKeyLength = 10;
char key_cstr[kMaxKeyLength];
string value;
TensorProtos protos;
TensorProto* data = protos.add_protos();
TensorProto* label = protos.add_protos();
data->set_data_type(TensorProto::BYTE);
if (caffe2::FLAGS_channel_first) {
data->add_dims(1);
data->add_dims(rows);
data->add_dims(cols);
} else {
data->add_dims(rows);
data->add_dims(cols);
data->add_dims(1);
}
label->set_data_type(TensorProto::INT32);
label->add_int32_data(0);
LOG(INFO) << "A total of " << num_items << " items.";
LOG(INFO) << "Rows: " << rows << " Cols: " << cols;
for (int item_id = 0; item_id < num_items; ++item_id) {
image_file.read(pixels.data(), rows * cols);
label_file.read(&label_value, 1);
data->set_byte_data(pixels.data(), rows * cols);
label->set_int32_data(0, static_cast<int>(label_value));
snprintf(key_cstr, kMaxKeyLength, "%08d", item_id);
protos.SerializeToString(&value);
string keystr(key_cstr);
// Put in db
transaction->Put(keystr, value);
if (++count % 1000 == 0) {
transaction->Commit();
}
if (data_limit > 0 && count == data_limit) {
LOG(INFO) << "Reached data limit of " << data_limit << ", stop.";
break;
}
}
}
} // namespace caffe2
int main(int argc, char** argv) {
caffe2::GlobalInit(&argc, &argv);
caffe2::convert_dataset(caffe2::FLAGS_image_file.c_str(), caffe2::FLAGS_label_file.c_str(),
caffe2::FLAGS_output_file.c_str(), caffe2::FLAGS_data_limit);
return 0;
}
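Note: the MNIST header fields are stored big-endian, hence swap_endian above. A worked example, assuming a little-endian host: the image-file magic bytes 00 00 08 03 read into a uint32_t as 0x03080000, which swap_endian maps back to 0x00000803 == 2051.

#include <cassert>
#include <cstdint>

// Same function as in make_mnist_db.cc, reproduced for a standalone check.
uint32_t swap_endian(uint32_t val) {
  val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF);
  return (val << 16) | (val >> 16);
}

int main() {
  assert(swap_endian(0x03080000u) == 2051u);
  return 0;
}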


@ -0,0 +1,57 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "caffe2/core/flags.h"
#include "caffe2/core/init.h"
#include "caffe2/core/predictor.h"
#include "caffe2/utils/proto_utils.h"
CAFFE2_DEFINE_string(init_net, "", "The given path to the init protobuffer.");
CAFFE2_DEFINE_string(
predict_net,
"",
"The given path to the predict protobuffer.");
namespace caffe2 {
void run() {
if (FLAGS_init_net.empty()) {
LOG(FATAL) << "No init net specified. Use --init_net=/path/to/net.";
}
if (FLAGS_predict_net.empty()) {
LOG(FATAL) << "No predict net specified. Use --predict_net=/path/to/net.";
}
caffe2::NetDef init_net, predict_net;
CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_init_net, &init_net));
CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_predict_net, &predict_net));
// Can be large due to constant fills
VLOG(1) << "Init net: " << ProtoDebugString(init_net);
LOG(INFO) << "Predict net: " << ProtoDebugString(predict_net);
auto predictor = caffe2::make_unique<Predictor>(init_net, predict_net);
LOG(INFO) << "Checking that a null forward-pass works";
Predictor::TensorVector inputVec, outputVec;
predictor->run(inputVec, &outputVec);
CAFFE_ENFORCE_GT(outputVec.size(), 0);
}
} // namespace caffe2
int main(int argc, char** argv) {
caffe2::GlobalInit(&argc, &argv);
caffe2::run();
// This is to allow us to use memory leak checks.
caffe2::ShutdownProtobufLibrary();
return 0;
}
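Note: the verifier above only checks that a null forward pass produces outputs. Feeding a real input uses the same API with borrowed TensorCPU pointers; a sketch of what would replace the null pass inside run() (the NCHW shape here is hypothetical):

caffe2::TensorCPU input;
input.Resize(1, 3, 224, 224);
float* data = input.mutable_data<float>();
// ... fill `data` with preprocessed input values ...
Predictor::TensorVector inputs{&input};
Predictor::TensorVector outputs;
predictor->run(inputs, &outputs);
CAFFE_ENFORCE_GT(outputs.size(), 0);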


@ -0,0 +1,42 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <iostream>
#include "caffe2/core/init.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/context.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/proto/caffe2.pb.h"
#define PRINT_SIZE(cls) \
std::cout << "Size of " #cls ": " << sizeof(cls) << " bytes." \
<< std::endl;
int main(int /* unused */, char** /* unused */) {
PRINT_SIZE(caffe2::Blob);
PRINT_SIZE(caffe2::Tensor<caffe2::CPUContext>);
PRINT_SIZE(caffe2::Tensor<caffe2::CUDAContext>);
PRINT_SIZE(caffe2::CPUContext);
PRINT_SIZE(caffe2::CUDAContext);
PRINT_SIZE(caffe2::OperatorBase);
PRINT_SIZE(caffe2::OperatorDef);
PRINT_SIZE(caffe2::Operator<caffe2::CPUContext>);
PRINT_SIZE(caffe2::Operator<caffe2::CUDAContext>);
PRINT_SIZE(caffe2::TypeMeta);
PRINT_SIZE(caffe2::Workspace);
return 0;
}


@ -0,0 +1,73 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <iostream>
#include <string>
#include "caffe2/core/init.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/operator_schema.h"
CAFFE2_DEFINE_string(schema, "",
"Print doc and schema of a particular operator");
static bool HasSchema(const std::string& str) {
return caffe2::OpSchemaRegistry::Schema(str);
}
static bool HasDoc(const std::string& str) {
const auto* schema = caffe2::OpSchemaRegistry::Schema(str);
return (schema != nullptr) && (schema->doc() != nullptr);
}
int main(int argc, char** argv) {
caffe2::GlobalInit(&argc, &argv);
if (!caffe2::FLAGS_schema.empty()) {
const auto* schema = caffe2::OpSchemaRegistry::Schema(
caffe2::FLAGS_schema);
if (!schema) {
std::cerr << "Operator " << caffe2::FLAGS_schema
<< " doesn't have a schema" << std::endl;
return 1;
}
std::cout << "Operator " << caffe2::FLAGS_schema << ": " << std::endl
<< *schema;
return 0;
}
for (const auto& pair : *caffe2::gDeviceTypeRegistry()) {
std::cout << "Device type " << pair.first
#ifndef CAFFE2_USE_LITE_PROTO
<< " (" << caffe2::DeviceType_Name(
static_cast<caffe2::DeviceType>(pair.first))
<< ")"
#endif
<< std::endl;
for (const auto& key : pair.second->Keys()) {
std::cout << "\t(schema: " << HasSchema(key) << ", doc: " << HasDoc(key)
<< ")\t" << key << std::endl;
}
}
std::cout << "Operators that have gradients registered:" << std::endl;
for (const auto& key : caffe2::GradientRegistry()->Keys()) {
std::cout << "\t(schema: " << HasSchema(key) << ", doc: "
<< HasDoc(key) << ")\t"
<< key << std::endl;
}
return 0;
}

40
binaries/run_plan.cc Normal file

@ -0,0 +1,40 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "caffe2/core/init.h"
#include "caffe2/core/operator.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/utils/proto_utils.h"
#include "caffe2/core/logging.h"
CAFFE2_DEFINE_string(plan, "", "The given path to the plan protobuffer.");
int main(int argc, char** argv) {
caffe2::GlobalInit(&argc, &argv);
if (caffe2::FLAGS_plan.size() == 0) {
LOG(ERROR) << "No plan specified. Use --plan=/path/to/plan.";
return 0;
}
LOG(INFO) << "Loading plan: " << caffe2::FLAGS_plan;
caffe2::PlanDef plan_def;
CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_plan, &plan_def));
std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
workspace->RunPlan(plan_def);
// This is to allow us to use memory leak checks.
caffe2::ShutdownProtobufLibrary();
return 0;
}

48
binaries/run_plan_mpi.cc Normal file

@ -0,0 +1,48 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <mpi.h>
#include "caffe2/core/init.h"
#include "caffe2/core/operator.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/utils/proto_utils.h"
#include "caffe2/core/logging.h"
CAFFE2_DEFINE_string(plan, "", "The given path to the plan protobuffer.");
int main(int argc, char** argv) {
caffe2::SetUsageMessage("Runs a caffe2 plan that has MPI operators in it.");
int mpi_ret;
MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mpi_ret);
if (mpi_ret != MPI_THREAD_MULTIPLE &&
mpi_ret != MPI_THREAD_SERIALIZED) {
std::cerr << "Caffe2 MPI requires the underlying MPI to support the "
"MPI_THREAD_SERIALIZED or MPI_THREAD_MULTIPLE mode.\n";
return 1;
}
caffe2::GlobalInit(&argc, &argv);
LOG(INFO) << "Loading plan: " << caffe2::FLAGS_plan;
caffe2::PlanDef plan_def;
CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_plan, &plan_def));
std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
workspace->RunPlan(plan_def);
// This is to allow us to use memory leak checks.
caffe2::ShutdownProtobufLibrary();
MPI_Finalize();
return 0;
}

193
binaries/speed_benchmark.cc Normal file

@ -0,0 +1,193 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <string>
#include "caffe2/core/init.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/utils/proto_utils.h"
#include "caffe2/utils/string_utils.h"
CAFFE2_DEFINE_string(net, "", "The given net to benchmark.");
CAFFE2_DEFINE_string(
init_net,
"",
"The given net to initialize any parameters.");
CAFFE2_DEFINE_string(
input,
"",
"Input that is needed for running the network. If "
"multiple input needed, use comma separated string.");
CAFFE2_DEFINE_string(
input_file,
"",
"Input file that contain the serialized protobuf for "
"the input blobs. If multiple input needed, use comma "
"separated string. Must have the same number of items "
"as input does.");
CAFFE2_DEFINE_string(
input_dims,
"",
"Alternate to input_files, if all inputs are simple "
"float TensorCPUs, specify the dimension using comma "
"separated numbers. If multiple input needed, use "
"semicolon to separate the dimension of different "
"tensors.");
CAFFE2_DEFINE_string(
input_type,
"", "Input type (uint8_t/float)");
CAFFE2_DEFINE_string(
output,
"",
"Output that should be dumped after the execution "
"finishes. If multiple outputs are needed, use comma "
"separated string. If you want to dump everything, pass "
"'*' as the output value.");
CAFFE2_DEFINE_string(
output_folder,
"",
"The folder that the output should be written to. This "
"folder must already exist in the file system.");
CAFFE2_DEFINE_int(warmup, 0, "The number of iterations to warm up.");
CAFFE2_DEFINE_int(iter, 10, "The number of iterations to run.");
CAFFE2_DEFINE_bool(
run_individual,
false,
"Whether to benchmark individual operators.");
CAFFE2_DEFINE_bool(force_engine, false, "Force engine field for all operators");
CAFFE2_DEFINE_string(engine, "", "Forced engine field value");
CAFFE2_DEFINE_bool(force_algo, false, "Force algo arg for all operators");
CAFFE2_DEFINE_string(algo, "", "Forced algo arg value");
using std::string;
using std::unique_ptr;
using std::vector;
int main(int argc, char** argv) {
caffe2::GlobalInit(&argc, &argv);
unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
// Run initialization network.
caffe2::NetDef net_def;
CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_init_net, &net_def));
CAFFE_ENFORCE(workspace->RunNetOnce(net_def));
// Load input.
if (caffe2::FLAGS_input.size()) {
vector<string> input_names = caffe2::split(',', caffe2::FLAGS_input);
if (caffe2::FLAGS_input_file.size()) {
vector<string> input_files = caffe2::split(',', caffe2::FLAGS_input_file);
CAFFE_ENFORCE_EQ(
input_names.size(),
input_files.size(),
"Input name and file should have the same number.");
for (int i = 0; i < input_names.size(); ++i) {
caffe2::BlobProto blob_proto;
CAFFE_ENFORCE(caffe2::ReadProtoFromFile(input_files[i], &blob_proto));
workspace->CreateBlob(input_names[i])->Deserialize(blob_proto);
}
} else if (caffe2::FLAGS_input_dims.size() || caffe2::FLAGS_input_type.size()) {
CAFFE_ENFORCE_NE(0, caffe2::FLAGS_input_dims.size(),
"Input dims must be specified when input tensors are used.");
CAFFE_ENFORCE_NE(0, caffe2::FLAGS_input_type.size(),
"Input type must be specified when input tensors are used.");
vector<string> input_dims_list =
caffe2::split(';', caffe2::FLAGS_input_dims);
CAFFE_ENFORCE_EQ(
input_names.size(),
input_dims_list.size(),
"Input name and dims should have the same number of items.");
vector<string> input_type_list =
caffe2::split(';', caffe2::FLAGS_input_type);
CAFFE_ENFORCE_EQ(
input_names.size(),
input_type_list.size(),
"Input name and type should have the same number of items.");
for (size_t i = 0; i < input_names.size(); ++i) {
vector<string> input_dims_str = caffe2::split(',', input_dims_list[i]);
vector<int> input_dims;
for (const string& s : input_dims_str) {
input_dims.push_back(caffe2::stoi(s));
}
caffe2::Blob* blob = workspace->GetBlob(input_names[i]);
if (blob == nullptr) {
blob = workspace->CreateBlob(input_names[i]);
}
caffe2::TensorCPU* tensor = blob->GetMutable<caffe2::TensorCPU>();
CHECK_NOTNULL(tensor);
tensor->Resize(input_dims);
if (input_type_list[i] == "uint8_t") {
tensor->mutable_data<uint8_t>();
} else if (input_type_list[i] == "float") {
tensor->mutable_data<float>();
} else {
CAFFE_THROW("Unsupported input type: ", input_type_list[i]);
}
}
} else {
CAFFE_THROW(
"You requested input tensors, but neither input_file nor "
"input_dims is set.");
}
}
// Run main network.
CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_net, &net_def));
// Force-override the engine and algo fields if requested.
if (caffe2::FLAGS_force_engine) {
LOG(INFO) << "Forcing engine to: " << caffe2::FLAGS_engine;
for (auto& op : *net_def.mutable_op()) {
op.set_engine(caffe2::FLAGS_engine);
}
}
if (caffe2::FLAGS_force_algo) {
LOG(INFO) << "Forcing algo to: " << caffe2::FLAGS_algo;
for (auto& op : *net_def.mutable_op()) {
caffe2::GetMutableArgument("algo", true, &op)->set_s(caffe2::FLAGS_algo);
}
}
caffe2::NetBase* net = workspace->CreateNet(net_def);
CHECK_NOTNULL(net);
net->TEST_Benchmark(
caffe2::FLAGS_warmup, caffe2::FLAGS_iter, caffe2::FLAGS_run_individual);
string output_prefix = caffe2::FLAGS_output_folder.size()
? caffe2::FLAGS_output_folder + "/"
: "";
if (caffe2::FLAGS_output.size()) {
vector<string> output_names = caffe2::split(',', caffe2::FLAGS_output);
if (caffe2::FLAGS_output == "*") {
output_names = workspace->Blobs();
}
for (const string& name : output_names) {
CAFFE_ENFORCE(
workspace->HasBlob(name),
"You requested a non-existing blob: ",
name);
string serialized = workspace->GetBlob(name)->Serialize(name);
string output_filename = output_prefix + name;
caffe2::WriteStringToFile(serialized, output_filename.c_str());
}
}
return 0;
}
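Note: the separator convention above is semicolons between tensors and commas between the dims of one tensor, e.g. --input "data,label" --input_dims "1,3,224,224;1,10" --input_type "float;float" (names and shapes hypothetical). The parsing reduces to two nested splits, sketched as:

std::vector<std::string> per_tensor = caffe2::split(';', "1,3,224,224;1,10");
for (const std::string& spec : per_tensor) {
  std::vector<int> dims;
  for (const std::string& d : caffe2::split(',', spec)) {
    dims.push_back(caffe2::stoi(d));
  }
  // dims is {1, 3, 224, 224} on the first pass and {1, 10} on the second.
}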

77
binaries/split_db.cc Normal file

@ -0,0 +1,77 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <string>
#include <sstream>
#include "caffe2/core/db.h"
#include "caffe2/core/init.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/core/logging.h"
CAFFE2_DEFINE_string(input_db, "", "The input db.");
CAFFE2_DEFINE_int(splits, 0, "The number of splits.");
CAFFE2_DEFINE_string(db_type, "", "The db type.");
CAFFE2_DEFINE_int(batch_size, 1000, "The write batch size.");
namespace caffe2 {
static int Split(int argc, char** argv) {
GlobalInit(&argc, &argv);
CAFFE_ENFORCE(FLAGS_input_db.size(), "Must specify --input_db=/path/to/db.");
CAFFE_ENFORCE(FLAGS_splits > 0, "Must specify a positive number of splits.");
CAFFE_ENFORCE(FLAGS_db_type.size(), "Must specify --db_type=[a db type].");
unique_ptr<db::DB> in_db(
db::CreateDB(FLAGS_db_type, FLAGS_input_db, db::READ));
CAFFE_ENFORCE(in_db != nullptr, "Cannot open input db: ", FLAGS_input_db);
unique_ptr<db::Cursor> cursor(in_db->NewCursor());
// This usually won't happen, but FWIW.
CAFFE_ENFORCE(
cursor != nullptr, "Cannot obtain cursor for input db: ", FLAGS_input_db);
vector<unique_ptr<db::DB>> out_dbs;
vector<unique_ptr<db::Transaction>> transactions;
for (int i = 0; i < FLAGS_splits; ++i) {
out_dbs.push_back(unique_ptr<db::DB>(db::CreateDB(
FLAGS_db_type, FLAGS_input_db + "_split_" + to_string(i), db::NEW)));
CAFFE_ENFORCE(out_dbs.back().get(), "Cannot create output db #", i);
transactions.push_back(
unique_ptr<db::Transaction>(out_dbs[i]->NewTransaction()));
CAFFE_ENFORCE(
transactions.back().get(), "Cannot get transaction for output db #", i);
}
int count = 0;
for (; cursor->Valid(); cursor->Next()) {
transactions[count % FLAGS_splits]->Put(cursor->key(), cursor->value());
if (++count % FLAGS_batch_size == 0) {
for (int i = 0; i < FLAGS_splits; ++i) {
transactions[i]->Commit();
}
LOG(INFO) << "Split " << count << " items so far.";
}
}
LOG(INFO) << "A total of " << count << " items processed.";
return 0;
}
} // namespace caffe2
int main(int argc, char** argv) {
return caffe2::Split(argc, argv);
}

89
binaries/tutorial_blob.cc Normal file

@ -0,0 +1,89 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "caffe2/core/blob.h"
#include "caffe2/core/init.h"
#include "caffe2/core/tensor.h"
#include "caffe2/core/logging.h"
// We will be lazy and just use the whole namespace.
using namespace caffe2;
int main(int argc, char** argv) {
caffe2::GlobalInit(&argc, &argv);
caffe2::ShowLogInfoToStderr();
LOG(INFO) <<
"This script corresponds to the Blob part of the Caffe2 C++ "
"tutorial.";
LOG(INFO) << "Let's create a blob myblob.";
Blob myblob;
LOG(INFO) << "Let's set it to int and set the value to 10.";
int* myint = myblob.GetMutable<int>();
*myint = 10;
LOG(INFO)
<< "Is the blob type int? "
<< myblob.IsType<int>();
LOG(INFO)
<< "Is the blob type float? "
<< myblob.IsType<float>();
const int& myint_const = myblob.Get<int>();
LOG(INFO)
<< "The value of the int number stored in the blob is: "
<< myint_const;
LOG(INFO)
<< "Let's try to get a float pointer. This will trigger an exception.";
try {
const float& myfloat = myblob.Get<float>();
LOG(FATAL) << "This line should never happen.";
} catch (std::exception& e) {
LOG(INFO)
<< "As expected, we got an exception. Its content says: "
<< e.what();
}
LOG(INFO) <<
"However, we can change the content type (and destroy the old "
"content) by calling GetMutable. Let's change it to double.";
double* mydouble = myblob.GetMutable<double>();
*mydouble = 3.14;
LOG(INFO) << "The new content is: " << myblob.Get<double>();
LOG(INFO) <<
"If we have a pre-created object, we can use Reset() to transfer the "
"object to a blob.";
std::string* pvec = new std::string();
myblob.Reset(pvec); // no need to release pvec, myblob takes ownership.
LOG(INFO) << "Is the blob now of type string? "
<< myblob.IsType<std::string>();
LOG(INFO) << "This concludes the blob tutorial.";
return 0;
}

66
binaries/zmq_feeder.cc Normal file

@ -0,0 +1,66 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// This binary provides an easy way to open a zeromq server and feed data to
// clients that connect to it. It uses the Caffe2 db as the backend, thus
// allowing one to convert any db-compliant storage to a zeromq service.
#include "caffe2/core/db.h"
#include "caffe2/core/init.h"
#include "caffe2/core/logging.h"
#include "caffe2/utils/zmq_helper.h"
CAFFE2_DEFINE_string(server, "tcp://*:5555", "The server address.");
CAFFE2_DEFINE_string(input_db, "", "The input db.");
CAFFE2_DEFINE_string(input_db_type, "", "The input db type.");
using caffe2::db::DB;
using caffe2::db::Cursor;
using caffe2::string;
int main(int argc, char** argv) {
caffe2::GlobalInit(&argc, &argv);
LOG(INFO) << "Opening DB...";
auto in_db = caffe2::db::CreateDB(
caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db, caffe2::db::READ);
CAFFE_ENFORCE(
in_db,
"Cannot load input db " + caffe2::FLAGS_input_db + " of expected type " +
caffe2::FLAGS_input_db_type);
auto cursor = in_db->NewCursor();
LOG(INFO) << "DB opened.";
LOG(INFO) << "Starting ZeroMQ server...";
// Socket to talk to clients
caffe2::ZmqSocket sender(ZMQ_PUSH);
sender.Bind(caffe2::FLAGS_server);
LOG(INFO) << "Server created at " << caffe2::FLAGS_server;
while (1) {
VLOG(1) << "Sending " << cursor->key();
sender.SendTillSuccess(cursor->key(), ZMQ_SNDMORE);
sender.SendTillSuccess(cursor->value(), 0);
cursor->Next();
if (!cursor->Valid()) {
cursor->SeekToFirst();
}
}
// We do not do an elegant quit since this binary is going to be terminated by
// control+C.
return 0;
}
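Note: the feeder sends each record as a two-part message: the key with ZMQ_SNDMORE, then the value. A minimal plain-libzmq consumer sketch (assuming the default tcp endpoint on localhost; illustrative only):

#include <string>
#include <zmq.h>

int main() {
  void* ctx = zmq_ctx_new();
  void* sock = zmq_socket(ctx, ZMQ_PULL);
  zmq_connect(sock, "tcp://localhost:5555");
  while (true) {
    zmq_msg_t key_msg, value_msg;
    zmq_msg_init(&key_msg);
    zmq_msg_init(&value_msg);
    zmq_msg_recv(&key_msg, sock, 0);   // first part: the db key
    zmq_msg_recv(&value_msg, sock, 0); // second part: the serialized value
    std::string key(
        static_cast<char*>(zmq_msg_data(&key_msg)), zmq_msg_size(&key_msg));
    // ... parse the value (e.g. a TensorProtos) and consume it here ...
    zmq_msg_close(&key_msg);
    zmq_msg_close(&value_msg);
  }
  zmq_close(sock);
  zmq_ctx_destroy(ctx);
  return 0;
}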

0
caffe/__init__.py Normal file


@ -0,0 +1,17 @@
file(GLOB Caffe_PROTOBUF_FILES "${CMAKE_CURRENT_SOURCE_DIR}/*.proto")
caffe2_protobuf_generate_cpp_py(Caffe_PROTO_SRCS Caffe_PROTO_HEADERS Caffe_PROTO_PY ${Caffe_PROTOBUF_FILES})
add_library(Caffe_PROTO OBJECT ${Caffe_PROTO_HEADERS} ${Caffe_PROTO_SRCS})
if (MSVC)
if(BUILD_SHARED_LIBS)
set(Caffe2_API_DEFINE "-DCAFFE2_API=__declspec(dllexport)")
else()
set(Caffe2_API_DEFINE "-DCAFFE2_API=")
endif()
target_compile_definitions(
Caffe_PROTO PRIVATE ${Caffe2_API_DEFINE})
endif()
install(FILES ${Caffe_PROTO_HEADERS} DESTINATION include/caffe/proto)

0
caffe/proto/__init__.py Normal file

1399
caffe/proto/caffe.proto Normal file

File diff suppressed because it is too large.

87
caffe2/.clang-format Normal file

@ -0,0 +1,87 @@
---
AccessModifierOffset: -1
AlignAfterOpenBracket: AlwaysBreak
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlinesLeft: true
AlignOperands: false
AlignTrailingComments: false
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Empty
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BraceWrapping:
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
BeforeCatch: false
BeforeElse: false
IndentBraces: false
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: false
ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:'
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat: false
ForEachMacros: [ FOR_EACH_RANGE, FOR_EACH, ]
IncludeCategories:
- Regex: '^<.*\.h(pp)?>'
Priority: 1
- Regex: '^<.*'
Priority: 2
- Regex: '.*'
Priority: 3
IndentCaseLabels: true
IndentWidth: 2
IndentWrappedFunctionNames: false
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: false
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
ReflowComments: true
SortIncludes: true
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp11
TabWidth: 8
UseTab: Never
...

310
caffe2/CMakeLists.txt Normal file
View File

@ -0,0 +1,310 @@
# ---[ Declare source file lists
# ---[ Add respective subdirectories
# Note: the folders that are being commented out have not been properly
# addressed yet.
add_subdirectory(proto)
add_subdirectory(contrib)
add_subdirectory(core)
add_subdirectory(core/nomnigraph)
add_subdirectory(cuda_rtc)
add_subdirectory(db)
add_subdirectory(distributed)
# add_subdirectory(experiments) # note, we may remove this folder at some point
add_subdirectory(image)
add_subdirectory(video)
add_subdirectory(mkl)
add_subdirectory(mobile)
add_subdirectory(mpi)
add_subdirectory(observers)
add_subdirectory(onnx)
add_subdirectory(operators)
add_subdirectory(operators/rnn)
add_subdirectory(perfkernels)
add_subdirectory(python)
add_subdirectory(queue)
add_subdirectory(sgd)
add_subdirectory(share)
# add_subdirectory(test) # todo: use caffe2_gtest_main instead of gtest_main because we will need to call GlobalInit
add_subdirectory(transforms)
add_subdirectory(utils)
# Advanced: if we have white list specified, we will do intersections for all
# main lib srcs.
if (CAFFE2_WHITELISTED_FILES)
caffe2_do_whitelist(Caffe2_CPU_SRCS CAFFE2_WHITELISTED_FILES)
caffe2_do_whitelist(Caffe2_GPU_SRCS CAFFE2_WHITELISTED_FILES)
endif()
# Debug messages - if you want to get a list of source files, enable the
# following.
if (FALSE)
message(STATUS "CPU sources: ")
foreach(tmp ${Caffe2_CPU_SRCS})
message(STATUS " " ${tmp})
endforeach()
message(STATUS "GPU sources: ")
foreach(tmp ${Caffe2_GPU_SRCS})
message(STATUS " " ${tmp})
endforeach()
message(STATUS "CPU test sources: ")
foreach(tmp ${Caffe2_CPU_TEST_SRCS})
message(STATUS " " ${tmp})
endforeach()
message(STATUS "GPU test sources: ")
foreach(tmp ${Caffe2_GPU_TEST_SRCS})
message(STATUS " " ${tmp})
endforeach()
endif()
# ---[ Generate and install header files.
# Write the macros file.
configure_file(
${PROJECT_SOURCE_DIR}/caffe2/core/macros.h.in
${PROJECT_BINARY_DIR}/caffe2/core/macros.h)
# Installing the header files
install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
DESTINATION include
FILES_MATCHING PATTERN "*.h")
install(FILES ${PROJECT_BINARY_DIR}/caffe2/core/macros.h
DESTINATION include/caffe2/core)
# ---[ List of libraries to link with
add_library(caffe2_protos STATIC $<TARGET_OBJECTS:Caffe_PROTO> $<TARGET_OBJECTS:Caffe2_PROTO>)
add_dependencies(caffe2_protos Caffe_PROTO Caffe2_PROTO)
# If we are going to link protobuf locally inside caffe2 libraries, we create a
# helper static library that always contains the libprotobuf source files, and
# link the caffe2-related dependent libraries to it.
target_include_directories(caffe2_protos INTERFACE $<INSTALL_INTERFACE:include>)
# Reason for this public dependency is as follows:
# (1) Strictly speaking, we should not expose any Protobuf related functions. We should
# only use function interfaces wrapped with our own public API, and link protobuf
# locally.
# (2) However, currently across the Caffe2 codebase, we have extensive use of protobuf
# functionalities. For example, not only libcaffe2.so uses it, but also other
# binaries such as python extensions etc. As a result, we will have to have a
# transitive dependency to libprotobuf.
#
# Good thing is that, if we specify CAFFE2_LINK_LOCAL_PROTOBUF, then we do not need to
# separately deploy protobuf binaries - libcaffe2.so will contain all functionalities
# one needs. One can verify this via ldd.
#
# Future TODO items include:
# (1) Enable using lite protobuf
# (2) Properly define a public API that does not directly depend on protobuf itself.
# (3) Expose the libprotobuf.a file for dependent libraries to link to.
#
# What does this mean for users and developers?
# (1) Users: nothing changes for users, other than the fact that CAFFE2_LINK_LOCAL_PROTOBUF
#     avoids the need to deploy protobuf.
# (2) Developers: if one simply uses core caffe2 functionality without using protobuf,
# nothing changes. If one has a dependent library that uses protobuf, then one needs to
# have the right protobuf version as well as linking to libprotobuf.a.
target_link_libraries(caffe2_protos PUBLIC protobuf::libprotobuf)
# Compile exposed libraries.
add_library(caffe2 ${Caffe2_CPU_SRCS})
caffe2_interface_library(caffe2_protos caffe2_protos_whole)
target_link_libraries(caffe2 PRIVATE caffe2_protos_whole)
if (${CAFFE2_LINK_LOCAL_PROTOBUF})
target_link_libraries(caffe2 INTERFACE protobuf::libprotobuf)
else()
target_link_libraries(caffe2 PUBLIC protobuf::libprotobuf)
endif()
target_link_libraries(caffe2 PUBLIC ${Caffe2_PUBLIC_DEPENDENCY_LIBS})
target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_LIBS})
target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_WHOLE_LINK_LIBS})
target_include_directories(caffe2 INTERFACE $<INSTALL_INTERFACE:include>)
target_compile_options(caffe2 INTERFACE "-std=c++11")
target_compile_options(caffe2 PRIVATE "-DCAFFE2_BUILD_MAIN_LIB")
# Use -O2 for release builds (-O3 doesn't improve perf, and -Os results in perf regression)
target_compile_options(caffe2 PRIVATE "$<$<OR:$<CONFIG:Release>,$<CONFIG:RelWithDebInfo>>:-O2>")
install(TARGETS caffe2 EXPORT Caffe2Targets DESTINATION lib)
caffe2_interface_library(caffe2 caffe2_library)
list(APPEND Caffe2_MAIN_LIBS caffe2_library)
# ---[ CUDA library.
if(USE_CUDA)
# A hack to deal with cuda library dependencies and modern CMake: the
# CUDA_ADD_LIBRARY includes a target_link_libraries, and as a result,
# one cannot use PUBLIC/PRIVATE/INTERFACE for the target anymore. This
# hack adds the PRIVATE keywords to CUDA_LIBRARIES so we can deal with
# it. We will then manually add the cudart library as interface libs.
set(__tmp ${CUDA_LIBRARIES})
set(CUDA_LIBRARIES PRIVATE ${CUDA_LIBRARIES})
CUDA_ADD_LIBRARY(caffe2_gpu ${Caffe2_GPU_SRCS})
set(CUDA_LIBRARIES ${__tmp})
target_link_libraries(caffe2_gpu INTERFACE caffe2::cudart)
target_include_directories(
caffe2_gpu INTERFACE $<INSTALL_INTERFACE:include>)
target_link_libraries(
caffe2_gpu PUBLIC caffe2 ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS})
target_link_libraries(
caffe2_gpu PRIVATE ${Caffe2_CUDA_DEPENDENCY_LIBS})
caffe2_interface_library(caffe2_gpu caffe2_gpu_library)
list(APPEND Caffe2_MAIN_LIBS caffe2_gpu_library)
install(TARGETS caffe2_gpu EXPORT Caffe2Targets DESTINATION lib)
endif()
# ---[ Test binaries.
if (BUILD_TEST)
set(Caffe2_ALL_TEST_SRCS ${Caffe2_CPU_TEST_SRCS})
if (USE_CUDA)
list(APPEND Caffe2_ALL_TEST_SRCS ${Caffe2_GPU_TEST_SRCS})
endif()
foreach(test_src ${Caffe2_ALL_TEST_SRCS})
get_filename_component(test_name ${test_src} NAME_WE)
add_executable(${test_name} "${test_src}")
# For tests, some of the test code actually directly call the dependent
# libraries even if they are not part of the public dependency libs. As a
# result, we will explicitly link the test against the Caffe2 dependency
# libs.
target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS} gtest_main)
if (USE_CUDA)
target_link_libraries(${test_name} ${Caffe2_CUDA_DEPENDENCY_LIBS})
endif()
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.0)
target_compile_features(${test_name} PRIVATE cxx_range_for)
endif()
add_test(NAME ${test_name} COMMAND $<TARGET_FILE:${test_name}>)
install(TARGETS ${test_name} DESTINATION test)
endforeach()
endif()
if (BUILD_PYTHON)
# Python site-packages
# Get canonical directory for python site packages (relative to install
# location). It varies from system to system.
pycmd(PYTHON_SITE_PACKAGES "
from distutils import sysconfig
print(sysconfig.get_python_lib(prefix=''))
")
# ---[ Options.
SET(PYTHON_LIB_REL_PATH "${PYTHON_SITE_PACKAGES}" CACHE STRING "Python installation path (relative to CMake installation prefix)")
message(STATUS "Using ${PYTHON_LIB_REL_PATH} as python relative installation path")
# Python extension suffix
# Try to get it from python via sysconfig.get_config_var('EXT_SUFFIX') first,
# falling back to ".pyd" on Windows and ".so" on all other platforms.
pycmd(PY_EXT_SUFFIX "
from distutils import sysconfig
ext_suffix = sysconfig.get_config_var('EXT_SUFFIX')
print(ext_suffix if ext_suffix else '')
")
if("${PY_EXT_SUFFIX}" STREQUAL "")
if (MSVC)
set(PY_EXT_SUFFIX ".pyd")
else()
set(PY_EXT_SUFFIX ".so")
endif()
endif()
# ---[ Python.
add_library(caffe2_pybind11_state MODULE ${Caffe2_CPU_PYTHON_SRCS})
set_target_properties(caffe2_pybind11_state PROPERTIES COMPILE_FLAGS "-fvisibility=hidden")
set_target_properties(caffe2_pybind11_state PROPERTIES PREFIX "")
set_target_properties(caffe2_pybind11_state PROPERTIES SUFFIX ${PY_EXT_SUFFIX})
if (APPLE)
set_target_properties(caffe2_pybind11_state PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
endif()
set_target_properties(
caffe2_pybind11_state PROPERTIES LIBRARY_OUTPUT_DIRECTORY
${CMAKE_BINARY_DIR}/caffe2/python)
target_link_libraries(
caffe2_pybind11_state caffe2_library)
install(TARGETS caffe2_pybind11_state DESTINATION "${PYTHON_LIB_REL_PATH}/caffe2/python")
if(USE_CUDA)
add_library(caffe2_pybind11_state_gpu MODULE ${Caffe2_GPU_PYTHON_SRCS})
set_target_properties(caffe2_pybind11_state_gpu PROPERTIES COMPILE_FLAGS "-fvisibility=hidden")
set_target_properties(caffe2_pybind11_state_gpu PROPERTIES PREFIX "")
set_target_properties(caffe2_pybind11_state_gpu PROPERTIES SUFFIX ${PY_EXT_SUFFIX})
if (APPLE)
set_target_properties(caffe2_pybind11_state_gpu PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
endif()
set_target_properties(
caffe2_pybind11_state_gpu PROPERTIES LIBRARY_OUTPUT_DIRECTORY
${CMAKE_BINARY_DIR}/caffe2/python)
target_link_libraries(
caffe2_pybind11_state_gpu caffe2_library caffe2_gpu_library)
install(TARGETS caffe2_pybind11_state_gpu DESTINATION "${PYTHON_LIB_REL_PATH}/caffe2/python")
endif()
if (MSVC AND CMAKE_GENERATOR MATCHES "Visual Studio")
# If we are building under windows, we will copy the file from
# build/caffe2/python/{Debug,Release}/caffe2_pybind11_state.pyd
# to its parent folder so that we can do in-build execution.
add_custom_target(windows_python_copy_lib ALL)
add_dependencies(windows_python_copy_lib caffe2_pybind11_state)
add_custom_command(
TARGET windows_python_copy_lib POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
$<TARGET_FILE:caffe2_pybind11_state>
${CMAKE_BINARY_DIR}/caffe2/python)
if (USE_CUDA)
add_dependencies(windows_python_copy_lib caffe2_pybind11_state_gpu)
add_custom_command(
TARGET windows_python_copy_lib POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
$<TARGET_FILE:caffe2_pybind11_state_gpu>
${CMAKE_BINARY_DIR}/caffe2/python)
endif()
endif()
# Finally, Copy all python files to build directory
# Generate and create all needed __init__.py files, if they aren't already
# present in the current source tree.
message(STATUS "Automatically generating missing __init__.py files.")
caffe_autogen_init_py_files()
# Create a custom target that copies all python files.
file(GLOB_RECURSE PYTHON_SRCS RELATIVE ${PROJECT_SOURCE_DIR}
"${PROJECT_SOURCE_DIR}/caffe2/*.py")
add_custom_target(python_copy_files ALL)
if(MSVC OR CMAKE_GENERATOR MATCHES "Ninja")
# ninja fails when the command line is too long, so we split
# the target into several. This would be beneficial for VS as well,
# since it builds targets in parallel but not custom commands.
foreach(python_src ${PYTHON_SRCS})
get_filename_component(dir ${python_src} DIRECTORY)
string(SHA1 name_hash "${python_src}")
# get_filename_component(name_we ${python_src} NAME_WE)
add_custom_target(python_copy_files_${name_hash}
COMMAND ${CMAKE_COMMAND} -E copy
${PROJECT_SOURCE_DIR}/${python_src} ${CMAKE_BINARY_DIR}/${dir})
add_dependencies(python_copy_files python_copy_files_${name_hash})
endforeach()
else()
foreach(python_src ${PYTHON_SRCS})
get_filename_component(dir ${python_src} DIRECTORY)
add_custom_command(
TARGET python_copy_files PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${PROJECT_SOURCE_DIR}/${python_src} ${CMAKE_BINARY_DIR}/${dir})
endforeach()
endif()
# Install commands
# Pick up static python files
install(DIRECTORY ${CMAKE_BINARY_DIR}/caffe2 DESTINATION ${PYTHON_LIB_REL_PATH}
FILES_MATCHING PATTERN "*.py")
# Caffe proto files
install(DIRECTORY ${CMAKE_BINARY_DIR}/caffe DESTINATION ${PYTHON_LIB_REL_PATH}
FILES_MATCHING PATTERN "*.py")
# Caffe2 proto files
install(DIRECTORY ${CMAKE_BINARY_DIR}/caffe2 DESTINATION ${PYTHON_LIB_REL_PATH}
FILES_MATCHING PATTERN "*.py")
endif()
# Finally, set the Caffe2_MAIN_LIBS variable in the parent scope.
set(Caffe2_MAIN_LIBS ${Caffe2_MAIN_LIBS} PARENT_SCOPE)
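As a quick sanity check, the two `pycmd` probes above can be run directly in Python; the printed values are platform-dependent (a sketch, the example outputs below are typical for a Linux CPython and not guaranteed):

```
from distutils import sysconfig

# Site-packages directory relative to the install prefix,
# e.g. 'lib/python2.7/site-packages'
print(sysconfig.get_python_lib(prefix=''))

# Extension-module suffix, e.g. '.cpython-36m-x86_64-linux-gnu.so'.
# On Python 2 this config var is unset, and the build above falls back
# to '.pyd' (Windows) or '.so' (everywhere else).
ext_suffix = sysconfig.get_config_var('EXT_SUFFIX')
print(ext_suffix if ext_suffix else '')
```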

21
caffe2/README.md Normal file
View File

@ -0,0 +1,21 @@
# Caffe2
[![Jenkins Build Status](https://ci.pytorch.org/jenkins/job/caffe2-master/lastCompletedBuild/badge/icon)](https://ci.pytorch.org/jenkins/job/caffe2-master)
Caffe2 is a lightweight, modular, and scalable deep learning framework. Building on the original [Caffe](http://caffe.berkeleyvision.org), Caffe2 is designed with expression, speed, and modularity in mind.
## Questions and Feedback
Please use GitHub issues (https://github.com/caffe2/caffe2/issues) to ask questions, report bugs, and request new features.
### Further Resources on [Caffe2.ai](http://caffe2.ai)
* [Installation](http://caffe2.ai/docs/getting-started.html)
* [Learn More](http://caffe2.ai/docs/learn-more.html)
* [Upgrading to Caffe2](http://caffe2.ai/docs/caffe-migration.html)
* [Datasets](http://caffe2.ai/docs/datasets.html)
* [Model Zoo](http://caffe2.ai/docs/zoo.html)
* [Tutorials](http://caffe2.ai/docs/tutorials.html)
* [Operators Catalogue](http://caffe2.ai/docs/operators-catalogue.html)
* [C++ API](http://caffe2.ai/doxygen-c/html/classes.html)
* [Python API](http://caffe2.ai/doxygen-python/html/namespaces.html)

1
caffe2/VERSION_NUMBER Normal file
View File

@ -0,0 +1 @@
0.8.2

0
caffe2/__init__.py Normal file
View File

17
caffe2/contrib/CMakeLists.txt Normal file
View File

@ -0,0 +1,17 @@
add_subdirectory(aten)
add_subdirectory(gloo)
add_subdirectory(nccl)
add_subdirectory(prof)
add_subdirectory(shm_mutex)
add_subdirectory(script)
# Finally pass the src lists back to the parent
# CPU source, test sources, binary sources
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_CPU_BINARY_SRCS ${Caffe2_CPU_BINARY_SRCS} PARENT_SCOPE)
# GPU source, test sources, binary sources
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_BINARY_SRCS ${Caffe2_GPU_BINARY_SRCS} PARENT_SCOPE)

0
caffe2/contrib/__init__.py Normal file
View File

29
caffe2/contrib/aten/CMakeLists.txt Normal file
View File

@ -0,0 +1,29 @@
if(USE_ATEN)
if(NOT USE_CUDA)
set(NO_CUDA ON)
endif()
set(TORCH_CUDA_ARCH_LIST "3.5 5.2 6.0 6.1+PTX")
set(TORCH_NVCC_FLAGS "-Xfatbin -compress-all")
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(AT_LINK_STYLE STATIC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden")
add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/aten aten EXCLUDE_FROM_ALL)
add_custom_command(OUTPUT aten_op.h
COMMAND
python ${CMAKE_CURRENT_SOURCE_DIR}/gen_op.py
--third_party_root=${PROJECT_SOURCE_DIR}/third_party
--template_dir=${PROJECT_SOURCE_DIR}/caffe2/contrib/aten
DEPENDS
ATen
${CMAKE_CURRENT_SOURCE_DIR}/gen_op.py
${CMAKE_CURRENT_SOURCE_DIR}/aten_op_template.h)
add_custom_target(__aten_op_header_gen DEPENDS aten_op.h)
add_library(aten_op_header_gen INTERFACE)
add_dependencies(aten_op_header_gen __aten_op_header_gen)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} "${CMAKE_CURRENT_SOURCE_DIR}/aten_op.cc" PARENT_SCOPE)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} "${CMAKE_CURRENT_SOURCE_DIR}/aten_op_cuda.cc" PARENT_SCOPE)
endif()

80
caffe2/contrib/aten/README.md Normal file
View File

@ -0,0 +1,80 @@
# An ATen operator for Caffe2
[ATen](https://github.com/zdevito/aten) is a simple tensor library that exposes the Tensor operations in Torch
and PyTorch directly in C++11. This library provides a generated wrapper around the ATen API
that makes these functions available in Caffe2 as an operator. It also makes them accessible via the
ToffeeIR.
### Example Usage in Caffe2
First identify a function in ATen you want to call in [Functions.h](https://github.com/zdevito/ATen/blob/master/doc/Functions.h),
[Tensor.h](https://github.com/zdevito/ATen/blob/master/doc/Tensor.h), or [Type.h](https://github.com/zdevito/ATen/blob/master/doc/Type.h).
We will call the `pow` operator:
```
static inline Tensor pow(const Tensor & self, Scalar exponent);
```
Now create a Caffe2 operator to call this op. The name of the operator is always `"ATen"`,
and there is always a string attribute `operator` that defines which ATen function to call:
```
import numpy as np
from caffe2.python import core, workspace
# create the Caffe2 Op:
op = core.CreateOperator(
"ATen",
["MyInput"],
["MyOutput"],
operator="pow", exponent=2.0)
```
Each `Tensor` input becomes a Caffe2 input blob, and each output becomes a Caffe2 output blob.
Non-tensor inputs such as `Scalar exponent` become Caffe2 `arg` attributes.
In the case of `Scalar`, the attributes can be either integers or floating point numbers.
The op can now be run like any other Caffe2 operator:
```
workspace.FeedBlob("MyInput",np.random.randn(2,3).astype(np.float32))
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("MyOutput"))
```
For methods, the first input is always the `this` Tensor in C++.
To call methods of ATen's `Type` objects, you provide an additional string attribute
that determines the type:
```
# create a 2x4 tensor filled with floating point ones
op = core.CreateOperator(
"ATen",
[],
["MyOutput"],
operator="ones", type="Float", size={2,4})
```
Generally, ATen operators are polymorphic across input types and work on both CPU and CUDA.
### Example Usage via PyTorch Symbolic
The ATen operator can also be used to define `symbolic` definitions for PyTorch when an operator is being exported
to ONNX. In this case, the definition of the operator looks the same but is defined using PyTorch's ONNX API:
```
class Add(torch.autograd.Function):
@staticmethod
def symbolic(g, a, b):
return g.op("ATen", a, b, operator_s = "add")
@staticmethod
def forward(ctx, a, b):
return a + b
```
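A minimal sketch of exporting the `Add` function above through `torch.onnx.export` (the module name and tensor shapes here are hypothetical; the full flow is documented in the contrib/aten docs):

```
import torch
from torch import nn
from torch.autograd import Variable

class AddModule(nn.Module):
    def forward(self, a, b):
        # Add.apply routes through the symbolic() definition on export
        return Add.apply(a, b)

torch.onnx.export(AddModule(),
                  (Variable(torch.ones(2, 2)), Variable(torch.ones(2, 2))),
                  "add.onnx",
                  verbose=True)  # prints the traced graph containing the ATen op
```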

22
caffe2/contrib/aten/aten_op.cc Normal file
View File

@ -0,0 +1,22 @@
#include "caffe2/contrib/aten/aten_op.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(ATen, ATenOp<CPUContext>);
template<>
at::Backend ATenOp<CPUContext>::backend() const {
return at::kCPU;
}
OPERATOR_SCHEMA(ATen);
CAFFE_KNOWN_TYPE(at::Half);
namespace math {
template<>
void Set<at::Half,CPUContext>(const size_t N, const at::Half h, at::Half* v, CPUContext * c) {
Set(N, h.x, (uint16_t*) v, c);
}
}
}

View File

@ -0,0 +1 @@
#include "caffe2/caffe2/contrib/aten/gen_aten_op.h"

19
caffe2/contrib/aten/aten_op_cuda.cc Normal file
View File

@ -0,0 +1,19 @@
#include "caffe2/contrib/aten/aten_op.h"
#include "caffe2/core/context_gpu.h"
namespace caffe2 {
REGISTER_CUDA_OPERATOR(ATen, ATenOp<CUDAContext>);
template<>
at::Backend ATenOp<CUDAContext>::backend() const {
return at::kCUDA;
}
namespace math {
template<>
void Set<at::Half,CUDAContext>(const size_t N, const at::Half h, at::Half* v, CUDAContext * c) {
Set(N, h.x, (uint16_t*) v, c);
}
}
}

218
caffe2/contrib/aten/aten_op_template.h Normal file
View File

@ -0,0 +1,218 @@
#pragma once
#include <unordered_map>
#include <string>
#include <ATen/ATen.h>
#include <caffe2/core/context.h>
#include <caffe2/core/operator.h>
#include <caffe2/utils/math.h>
#include <iostream>
// a map from descriptor strings (see [DESCRIPTORS])
// to the key in the switch statement that implements them
static std::unordered_map<std::string, int> op_to_key = {
${mappings}
};
namespace caffe2 {
using at::Half; // for AT_FORALL_SCALAR_TYPES
template <class Context>
class ATenOp : public Operator<Context> {
public:
ATenOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws) {
VLOG(2) << "ATen OpDef: " << ProtoDebugString(operator_def) << "\n";
switch(findImplementation(operator_def)) {
${implementations}
default:
CAFFE_THROW("Unexpected key value for aten operator");
}
}
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice() override {
return run_op();
}
private:
// actual operator implementation is initialized in ctor.
std::function<bool()> run_op;
at::Backend backend() const;
TypeMeta typeMetaFor(const at::Tensor & t) {
return typeMetaFor(t.type().scalarType());
}
TypeMeta typeMetaFor(at::ScalarType st) {
#define DEFINE_CASE(ctype,aten_name,_) \
case at::k##aten_name: \
return TypeMeta::Make<ctype>();
switch(st) {
AT_FORALL_SCALAR_TYPES(DEFINE_CASE)
default:
CAFFE_THROW("Unknown ATen Type");
}
#undef DEFINE_CASE
}
at::Type & typeFor(const Tensor<Context> & ten) {
return at::getType(backend(), atScalarTypeFor(ten.meta()));
}
at::Tensor tensorWrapping(const Tensor<Context>& ten_) {
auto& ten = const_cast<Tensor<Context>&>(ten_);
return typeFor(ten).tensorFromBlob(ten.raw_mutable_data(), ten.dims());
}
at::Tensor loadInput(size_t i) {
return tensorWrapping(Input(i));
}
std::vector<at::Tensor> loadInputsAtOffset(size_t s) {
std::vector<at::Tensor> results;
for (size_t i = s; i < InputSize(); i++) {
results.push_back(loadInput(i));
}
return results;
}
at::ScalarType atScalarTypeFor(const TypeMeta & meta) {
#define DEFINE_IF(ctype,aten_name,_) \
if(meta.Match<ctype>()) { \
return at::k##aten_name; \
}
AT_FORALL_SCALAR_TYPES(DEFINE_IF)
#undef DEFINE_IF
CAFFE_THROW("Unknown type meta"); // TODO: improve error message...
}
void assignTo(Tensor<Context> * dst, const at::Tensor & src_) {
at::Tensor src = src_.contiguous();
auto at_sizes = src.sizes();
std::vector<int64_t> dims(at_sizes.begin(),at_sizes.end());
dst->Resize(dims);
dst->ShareExternalPointer(
src.data_ptr(), typeMetaFor(src), 0, [src](void* ptr) mutable {
// the closure holds a handle to src until it is called,
// keeping the aten memory alive
return src.reset();
});
}
void assignListStartingAt(
size_t offset,
const std::vector<at::Tensor>& tensors) {
for (size_t i = 0; i < tensors.size(); i++) {
assignTo(Output(offset + i), tensors[i]);
}
}
// the AT_FORALL_SCALAR_TYPES macro just gives an 'i' or 'd' argument
// for each type to specify whether it is stored as an integer or a double.
// We need this workaround here to extract the value in the scalar losslessly
// because in some cases, like 'sum', Torch promotes float to double
// and will complain if we downcast it with toFloat, causing it
// to lose precision
double extract_d(const at::Scalar & s) {
return s.toDouble();
}
int64_t extract_i(const at::Scalar & s) {
return s.toLong();
}
void assignTo(Tensor<Context> * dst, at::Type & inferred_type, at::Scalar scalar) {
switch(inferred_type.scalarType()) {
#define DEFINE_CASE(ctype,aten_name,native) \
case at::k##aten_name: { \
auto value = extract_##native(scalar); \
assignToValue<ctype>(dst, at::convert<ctype,decltype(value)>(value)); \
} break;
AT_FORALL_SCALAR_TYPES(DEFINE_CASE)
#undef DEFINE_CASE
default:
CAFFE_THROW("Unknown ATen Type");
}
}
template<typename T>
void assignToValue(Tensor<Context> * dst, T v) {
dst->Resize(std::vector<TIndex>());
math::Set(1, v, dst->template mutable_data<T>(), &context_);
}
int findImplementation(const OperatorDef& operator_def) {
CAFFE_ENFORCE(HasArgument("operator"));
std::string op = OperatorBase::GetSingleArgument<std::string>("operator", "");
// construct descriptor string ([DESCRIPTORS]) given the attributes
// and inputs of this operator_def, and look up the implementation key
// for this variant
std::stringstream descriptor;
descriptor << op;
std::vector<std::string> attrs;
for(size_t i = 0; i < operator_def.arg_size(); i++) {
auto & attr = operator_def.arg(i);
if(attr.name() == "operator" || attr.name() == "type" )
continue;
attrs.push_back(attr.name());
}
std::sort(attrs.begin(), attrs.end());
for(auto & a : attrs)
descriptor << "-" << a;
std::string descriptor_sized =
descriptor.str() + "-" + caffe2::to_string(InputSize());
std::string descriptor_var_args = descriptor.str() + "-*";
if (op_to_key.count(descriptor_sized) > 0) {
return op_to_key[descriptor_sized];
}
if (op_to_key.count(descriptor_var_args) > 0) {
return op_to_key[descriptor_var_args];
}
std::stringstream ss;
ss << "Attempting to run unknown ATen operator configuration: "
<< descriptor_sized;
CAFFE_THROW(ss.str());
}
at::Scalar readScalarAttribute(const std::string & name) {
if(OperatorBase::HasSingleArgumentOfType<int64_t>(name)) {
return OperatorBase::GetSingleArgument<int64_t>(name, 0);
} else {
CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType<float>(name));
return OperatorBase::GetSingleArgument<float>(name, 0);
}
}
template<typename T>
T readAttribute(const std::string & name) {
CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType<T>(name));
return OperatorBase::GetSingleArgument<T>(name, 0);
}
std::vector<int64_t> readIntList(const std::string & name) {
CAFFE_ENFORCE(OperatorBase::HasArgument(name));
return OperatorBase::GetRepeatedArgument<int64_t>(name, {});
}
template <int N>
std::array<bool, N> readBoolMask(const std::string& name) {
CAFFE_ENFORCE(OperatorBase::HasArgument(name));
std::vector<int64_t> ints =
OperatorBase::GetRepeatedArgument<int64_t>(name, {});
std::array<bool, N> result;
for (size_t i = 0; i < N; ++i) {
result[i] = ints.at(i);
}
return result;
}
at::ScalarType stringToScalarType(const std::string & name) {
#define DEFINE_IF(type,aten) \
if(#type == name) \
return at::k##aten;
DEFINE_IF(float16, Half)
DEFINE_IF(float, Float)
DEFINE_IF(double, Double)
DEFINE_IF(uint8, Byte)
DEFINE_IF(int8, Char)
DEFINE_IF(int16, Short)
DEFINE_IF(int32, Int)
DEFINE_IF(int64, Long)
CAFFE_THROW("unsupported type annotation: ", name);
}
at::Type & stringToType(const std::string & name) {
return at::getType(backend(), stringToScalarType(name));
}
at::Type * readTypeAttribute(const std::string & name) {
CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType<std::string>(name));
return &stringToType(OperatorBase::GetSingleArgument<std::string>(name, ""));
}
};
}
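To make the descriptor dispatch above concrete, here is a small Python model of `findImplementation` (for illustration only; the `op_to_key` contents are hypothetical stand-ins for the generated `${mappings}`):

```
# Python model of ATenOp::findImplementation's descriptor lookup.
op_to_key = {"add-2": 0, "pow-exponent-1": 1, "cat-dim-*": 2}  # hypothetical

def find_implementation(op, attrs, num_inputs):
    # op name plus sorted attribute names (minus the special "operator"
    # and "type" attributes) form the base descriptor
    base = "-".join([op] + sorted(a for a in attrs if a not in ("operator", "type")))
    sized = "{}-{}".format(base, num_inputs)   # exact input count tried first
    var_args = base + "-*"                     # then the TensorList fallback
    if sized in op_to_key:
        return op_to_key[sized]
    if var_args in op_to_key:
        return op_to_key[var_args]
    raise KeyError("unknown ATen operator configuration: " + sized)

assert find_implementation("pow", ["operator", "exponent"], 1) == 1
assert find_implementation("cat", ["operator", "dim"], 5) == 2  # varargs variant
```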

86
caffe2/contrib/aten/aten_test.py Normal file
View File

@ -0,0 +1,86 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, dyndep
from hypothesis import given
import caffe2.python.hypothesis_test_util as hu
import hypothesis.strategies as st
import numpy as np
dyndep.InitOpsLibrary('@/caffe2/caffe2/contrib/aten:aten_op')
class TestATen(hu.HypothesisTestCase):
@given(inputs=hu.tensors(n=2), **hu.gcs)
def test_add(self, inputs, gc, dc):
op = core.CreateOperator(
"ATen",
["X", "Y"],
["Z"],
operator="add")
def ref(X, Y):
return [X + Y]
self.assertReferenceChecks(gc, op, inputs, ref)
@given(inputs=hu.tensors(n=1), **hu.gcs)
def test_pow(self, inputs, gc, dc):
op = core.CreateOperator(
"ATen",
["S"],
["Z"],
operator="pow", exponent=2.0)
def ref(X):
return [np.square(X)]
self.assertReferenceChecks(gc, op, inputs, ref)
@given(x=st.integers(min_value=2, max_value=8), **hu.gcs)
def test_sort(self, x, gc, dc):
inputs = [np.random.permutation(x)]
op = core.CreateOperator(
"ATen",
["S"],
["Z", "I"],
operator="sort")
def ref(X):
return [np.sort(X), np.argsort(X)]
self.assertReferenceChecks(gc, op, inputs, ref)
@given(inputs=hu.tensors(n=1), **hu.gcs)
def test_sum(self, inputs, gc, dc):
op = core.CreateOperator(
"ATen",
["S"],
["Z"],
operator="sum")
def ref(X):
return [np.sum(X)]
self.assertReferenceChecks(gc, op, inputs, ref)
@given(**hu.gcs)
def test_ones(self, gc, dc):
op = core.CreateOperator(
"ATen",
[],
["Z"],
operator="ones", type="float", size={2, 4})
def ref():
return [np.ones([2, 4])]
self.assertReferenceChecks(gc, op, [], ref)
if __name__ == "__main__":
import unittest
unittest.main()

157
caffe2/contrib/aten/docs/pytorch_to_caffe2.md Normal file
View File

@ -0,0 +1,157 @@
# Using ONNX and ATen to export models from PyTorch to Caffe2
When using ONNX to export a model from PyTorch into Caffe2, you sometimes end up
hitting operators that are not yet part of the ONNX specification. These may be
operators that haven't been standardized yet, or custom `torch.autograd.Function` types that
are specific to a network.
To bridge this gap, we provide an experimental operator in ONNX that allows you to directly access PyTorch's tensor functions using the ATen library.
[ATen](https://github.com/zdevito/aten) is the underlying C++ library that PyTorch uses to do tensor operations. Caffe2 has an [ATen operator](https://github.com/caffe2/caffe2/tree/master/caffe2/contrib/aten)
that can run these tensor functions in a Caffe2 network after importing them through ONNX.
This guide explains how to configure Caffe2 and modify your PyTorch program to use
this functionality.
### Enable ATen in Caffe2
The ATen facility in Caffe2 is part of a contrib package and needs to be enabled
when you configure Caffe2 using cmake:
```
git clone https://github.com/caffe2/caffe2/
mkdir caffe2/build
cd caffe2/build
cmake -DUSE_ATEN=ON <other build options> ..
make install
```
### Describe How to Export a PyTorch Autograd Function using ATen
To export a model to ONNX, PyTorch first creates a trace of all the `torch.autograd.Function`s run
in the forward pass of a network. For each function in the trace, it calls that function's
`symbolic` method which describes how to construct the part of the ONNX graph
that will compute this function (see [basic_ops.py](https://github.com/pytorch/pytorch/blob/master/torch/autograd/_functions/basic_ops.py#L59) for examples).
When equivalent ONNX operators do not exist, you can instead call any ATen function.
As an example, let's assume we have an autograd function that computes `x*x+y`:
```
class MyFunction(Function):
@staticmethod
def forward(ctx, x, y):
return x*x + y
```
We can add a `symbolic` method to it like so:
```
class MyFunction(Function):
@staticmethod
def forward(ctx, x, y):
return x*x + y
@staticmethod
def symbolic(graph, x, y):
x2 = graph.at("mul", x, x)
r = graph.at("add", x2, y)
# x, y, x2, and r are 'Node' objects
# print(r) or print(graph) will print out a textual representation for debugging.
# this representation will be converted to ONNX protobufs on export.
return r
```
The function `graph.at` adds a new ATen op to the computation graph.
You can call any ATen function using this facility. To do so,
first identify a function in ATen you want to call in [Functions.h](https://github.com/zdevito/ATen/blob/master/doc/Functions.h),
[Tensor.h](https://github.com/zdevito/ATen/blob/master/doc/Tensor.h), or [Type.h](https://github.com/zdevito/ATen/blob/master/doc/Type.h).
As an example, we might want to call the `pow` operator:
```
static inline Tensor pow(const Tensor & self, Scalar exponent);
```
We can translate this into the equivalent `graph.at` function:
```
def symbolic(graph, x):
graph.at("pow", x, exponent_f = 2.0) # compute x**2
```
Tensor arguments to ATen functions become arguments to `graph.at`, while a `Scalar`
like `exponent` becomes a keyword argument that specifies an ONNX attribute.
Attributes are suffixed with their type (`_f` for floats, `_i` for integers, and `_s` for strings).
For methods, the first input is always the `this` Tensor in C++.
To call methods of ATen's `Type` objects, you provide an additional string attribute
that determines the type. For instance, `ones` creates a new constant tensor of all ones:
```
class Type {
...
virtual Tensor ones(IntList size) const;
...
};
```
From PyTorch it can be created by adding the type as an additional attribute:
```
def symbolic(graph, x):
return graph.at("ones", type_s="float", size_i=[2,4])
```
Generally, ATen operators are polymorphic across input types and work on both CPU and CUDA.
## Putting it together
With these building blocks we can now write and export networks that include custom operators using `torch.onnx.export`:
```
class MyModule(nn.Module):
def forward(self, x, y):
# you can combine your ATen ops with standard onnx ones
x = nn.ReLU()(x)
return MyFunction.apply(x, y)
torch.onnx.export(MyModule(),
(Variable(torch.ones(3,4)), Variable(torch.ones(3,4))),
"output.onnx",
verbose=True)
```
This exports the following graph, which contains calls to the `ATen` operator:
```
graph(%1 : Float(3, 4)
%2 : Float(3, 4)) {
%3 : Float(3, 4) = Relu(%1), uses = [%4.i0, %4.i1];
%4 : UNKNOWN_TYPE = ATen[operator=mul](%3, %3), uses = [%5.i0];
%5 : Float(3, 4) = ATen[operator=add](%4, %2), uses = [%0.i0];
return (%5);
}
```
The graph can then be imported using ONNX and run with Caffe2:
```
import onnx
import caffe2.python.onnx.backend
import numpy as np
graph = onnx.load("output.onnx")
a = np.random.randn(3, 4).astype(np.float32)
b = np.random.randn(3, 4).astype(np.float32)
prepared_backend = caffe2.python.onnx.backend.prepare(graph)
W = {graph.graph.input[0].name: a, graph.graph.input[1].name: b}
c2_out = prepared_backend.run(W)[0]
x = np.maximum(a, 0)
r = x*x + b
np.testing.assert_array_almost_equal(r, c2_out)
```
### Code
For the full source code for this tutorial, see [sample.py](sample.py).

54
caffe2/contrib/aten/docs/sample.py Normal file
View File

@ -0,0 +1,54 @@
import numpy as np
from torch import nn
from torch.autograd import Variable, Function
import torch.onnx
import onnx
import caffe2.python.onnx.backend
class MyFunction(Function):
@staticmethod
def forward(ctx, x, y):
return x*x + y
@staticmethod
def symbolic(graph, x, y):
x2 = graph.at("mul", x, x)
r = graph.at("add", x2, y)
# x, y, x2, and r are 'Node' objects
# print(r) or print(graph) will print out a textual representation for debugging.
# this representation will be converted to ONNX protobufs on export.
return r
class MyModule(nn.Module):
def forward(self, x, y):
# you can combine your ATen ops with standard onnx ones
x = nn.ReLU()(x)
return MyFunction.apply(x, y)
torch.onnx.export(MyModule(),
(Variable(torch.ones(3,4)), Variable(torch.ones(3,4))),
"output.onnx",
verbose=True)
# prints the graph for debugging:
# graph(%1 : Float(3, 4)
# %2 : Float(3, 4)) {
# %3 : Float(3, 4) = Relu(%1), uses = [%4.i0, %4.i1];
# %4 : UNKNOWN_TYPE = ATen[operator=mul](%3, %3), uses = [%5.i0];
# %5 : Float(3, 4) = ATen[operator=add](%4, %2), uses = [%0.i0];
# return (%5);
# }
graph = onnx.load("output.onnx")
a = np.random.randn(3, 4).astype(np.float32)
b = np.random.randn(3, 4).astype(np.float32)
prepared_backend = caffe2.python.onnx.backend.prepare(graph)
W = {graph.graph.input[0].name: a, graph.graph.input[1].name: b}
c2_out = prepared_backend.run(W)[0]
x = np.maximum(a, 0)
r = x*x + b
np.testing.assert_array_almost_equal(r, c2_out)

274
caffe2/contrib/aten/gen_op.py Executable file
View File

@ -0,0 +1,274 @@
#!/bin/env python
# Copyright (c) 2016-present, Facebook, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
import sys
import yaml
import argparse
import os
from copy import deepcopy
parser = argparse.ArgumentParser()
parser.add_argument("--template_dir", default=".", help="where template.h is")
parser.add_argument("--yaml_dir", default="aten/src/ATen/ATen",
help="where ATen yaml files are")
parser.add_argument("--output_prefix", default="", help="")
parser.add_argument(
"--install_dir", default=".", help="where to put generated file")
parser.add_argument("--third_party_root", default="", help="caffe2 third_party")
args, _ = parser.parse_known_args()
if args.third_party_root:
sys.path.append(os.path.join(args.third_party_root, "aten/src/ATen"))
from code_template import CodeTemplate as CT
else:
from src.ATen.code_template import CodeTemplate as CT
OP_TEMPLATE = CT.from_file(
os.path.join(args.template_dir, 'aten_op_template.h'))
try:
# use faster C loader if available
from yaml import CLoader as Loader
except ImportError:
from yaml import Loader
def write(filename, s):
with open(filename, "w") as f:
f.write(s)
def read(filename):
with open(filename, "r") as f:
return f.read()
def value_has_tensors(v):
# Sparse shouldn't appear in the public API; this seems to be a temporary bug
return "Tensor" in v['dynamic_type'] and "Sparse" not in v['dynamic_type']
def value_is_tensor_type(v):
return value_has_tensors(v) and v['dynamic_type'] != 'TensorList'
# for each aten type, how do we handle a return value of that type?
RETURN_MAP = {
'Tensor': 'assignTo(Output(${offset}),${output});',
'Scalar': 'assignTo(Output(${offset}),*inferred_type, ${output});',
'bool': 'assignToValue<int64_t>(Output(${offset}),${output});',
'int64_t': 'assignToValue<int64_t>(Output(${offset}),${output});',
'std::vector<Tensor>': 'assignListStartingAt(${offset}, ${output});',
}
# for each non-Tensor aten argument, how do we read it from caffe2's
# attribute list. Most of these call runtime functions defined in the
# template class.
ARGUMENT_MAP = {
'Scalar': 'at::Scalar ${arg} = readScalarAttribute("${arg}");',
'bool': 'bool ${arg} = readAttribute<int64_t>("${arg}");',
'int': 'int ${arg} = readAttribute<int64_t>("${arg}");',
'double': 'double ${arg} = readAttribute<float>("${arg}");',
'int64_t': 'int64_t ${arg} = readAttribute<int64_t>("${arg}");',
'IntList': 'auto ${arg} = readIntList("${arg}");',
'std::array<bool, 2>': 'auto ${arg} = readBoolMask<2>("${arg}");',
'std::array<bool, 3>': 'auto ${arg} = readBoolMask<3>("${arg}");',
}
def expand(o):
num_defaults = sum(1 if 'default' in arg else 0 for arg in o['arguments'])
results = [o]
for i in range(0, num_defaults):
# last num_default values should be default
assert('default' in o['arguments'][-(i + 1)])
v = deepcopy(o)
v['arguments'] = v['arguments'][:-(i + 1)]
results.append(v)
return results
# filter the list of declarations removing things we cannot support
def supports(o):
# skip all in-place operators for now since aten cannot Resize
# caffe2 memory inside an operator
if o['inplace']:
return False
# _out variants also work in-place on arguments taken as destinations
# we also cannot handle these because aten cannot resize caffe2 Tensors
if "_out" in o['name']:
return False
# skip return types we cannot handle
for ret in o['returns']:
if not value_has_tensors(ret) and ret['type'] not in RETURN_MAP:
print("Skipping {} Because of Ret: {} ({})".format(
o['name'], ret['type'], ret['dynamic_type']))
return False
# skip arguments we cannot handle
for arg in o['arguments']:
if not value_has_tensors(arg) and arg['type'] not in ARGUMENT_MAP:
print("Skipping {} Because of Arg: {} ({}) ".format(
o['name'], arg['type'], arg['dynamic_type']))
return False
return True
# template for each potential operator.
# each operator has an integer 'key' associated with it, and
# a lambda that defines the operator
# non-tensor attributes are created in ${initialization}
# and then saved as arguments to the lambda
# Inputs/Outputs are read inside the lambda
OPTION_TEMPLATE = CT("""\
case ${key}: { // ${name}
${initialization}
run_op = [=] {
${statements}
auto the_result = ${invocation};
${assignments}
return true;
};
} break;
""")
def get_output(o, i):
if len(o['returns']) == 1:
return 'the_result'
else:
return 'std::get<{}>(the_result)'.format(i)
def attribute_names(o):
return sorted([a['name'] for a in o['arguments'] if not value_has_tensors(a)])
def required_attribute_names(o):
return sorted([a['name'] for a in o['arguments'] if not value_has_tensors(a) and 'default' not in a])
def self_as_first_argument(arguments):
return ([a for a in arguments if a['name'] == 'self'] +
[a for a in arguments if a['name'] != 'self'])
def get_num_inputs(o):
args = 0
for a in o['arguments']:
if a['type'] == 'TensorList':
return '*'
elif value_has_tensors(a):
args += 1
return str(args)
if __name__ == '__main__':
decls = yaml.load(read(os.path.join(args.yaml_dir, 'Declarations.yaml')), Loader=Loader)
filtered = [expanded for o in decls for expanded in expand(o) if supports(expanded)]
top_env = {
'mappings': [],
'implementations': [],
}
seen = set()
key = 0
for o in filtered:
# [DESCRIPTORS]
# each option is associated with a descriptor string that is used
# to figure out which version of an op is being used:
# The format is:
# opname-attribute_1-attribute_2-num_inputs
# Example:
# lerp-weight-2
# the operator lerp takes 2 tensor inputs and has the attribute weight
attr_names = attribute_names(o)
num_inputs = get_num_inputs(o)
descriptor = '-'.join([o['name']] + attr_names + [num_inputs])
if descriptor in seen:
continue
seen.add(descriptor)
# map from descriptor string to the integer key in the switch statements
# that initializes the operators
top_env['mappings'].append('{{ "{}", {} }},'.format(descriptor, key))
env = {
'name': o['name'],
'statements': [],
'arguments': [],
'assignments': [],
'initialization': [],
'key': str(key),
}
defined_inferred_type = False
if 'Tensor' in o['method_of']:
# make sure 'self' is the first argument. currently Declarations.yaml
# does not always do this. Instead it keeps the argument list the same order
# as the Type method.
o['arguments'] = self_as_first_argument(o['arguments'])
elif 'namespace' not in o['method_of']:
# methods on type like 'ones' or 'zeros' always take a
# string attribute that is translated into the at::Type object
# e.g. "Float" is at::kFloat
assert('Type' in o['method_of'])
defined_inferred_type = True
env['initialization'].append(
'auto inferred_type = readTypeAttribute("type");')
i = 0
for arg in o['arguments']:
env['arguments'].append(arg['name'])
if arg['type'] == 'TensorList':
env['statements'].append(
'auto {} = loadInputsAtOffset({});'.format(arg['name'], i))
elif value_is_tensor_type(arg):
assert(i != '*') # tensor list is not last argument
# load tensor inputs from Caffe2
env['statements'].append(
"auto {} = loadInput({});".format(arg['name'], i))
i += 1
if arg['dynamic_type'] == 'Tensor' and not defined_inferred_type:
# first tensor input is used to define the output type.
defined_inferred_type = True
env['statements'].append(
'auto inferred_type = &({}.type());'.format(
arg['name']))
else:
init = CT(ARGUMENT_MAP[arg['type']]).substitute(env, arg=arg['name'])
env['initialization'].append(init)
for i, r in enumerate(o['returns']):
t = RETURN_MAP[r['type'] if not value_is_tensor_type(r) else 'Tensor']
assignment = CT(t).substitute(env, offset=i, output=get_output(o, i))
env['assignments'].append(assignment)
if 'Tensor' in o['method_of']:
env['invocation'] = "self.{}({})".format(
o['name'], ', '.join(env['arguments'][1:]))
elif 'namespace' in o['method_of']:
env['invocation'] = CT("at::${name}(${arguments})").substitute(env)
else:
assert('Type' in o['method_of'])
env['invocation'] = CT(
'inferred_type->${name}(${arguments})').substitute(env)
top_env['implementations'].append(OPTION_TEMPLATE.substitute(env))
key += 1
write(os.path.join(args.install_dir, args.output_prefix + "aten_op.h"), OP_TEMPLATE.substitute(top_env))
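For illustration, here is what `expand` produces for a declaration with trailing defaulted arguments (the declaration dict below is a hypothetical, Declarations.yaml-shaped toy):

```
# Toy entry: 'alpha' and 'beta' carry defaults, so expand() also emits
# variants with those trailing arguments dropped.
decl = {
    'name': 'foo',
    'arguments': [
        {'name': 'self', 'type': 'Tensor', 'dynamic_type': 'Tensor'},
        {'name': 'alpha', 'type': 'Scalar', 'default': 1},
        {'name': 'beta', 'type': 'Scalar', 'default': 0},
    ],
}

for variant in expand(decl):
    print([a['name'] for a in variant['arguments']])
# ['self', 'alpha', 'beta']
# ['self', 'alpha']
# ['self']
```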

202
caffe2/contrib/cuda-convnet2/LICENSE Normal file
View File

@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

7
caffe2/contrib/cuda-convnet2/README.md Normal file
View File

@ -0,0 +1,7 @@
# cuda-convnet2
Automatically exported from code.google.com/p/cuda-convnet2
You can read the documentation in two ways:
1. On this site: go to branches > wiki.
2. On Google Code (for now?): https://code.google.com/p/cuda-convnet2/

50
caffe2/contrib/cuda-convnet2/build.sh Executable file
View File

@ -0,0 +1,50 @@
#!/bin/sh
# Copyright 2014 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###############################################################################
# Fill in the below environment variables.
#
# If you're not sure what these paths should be,
# you can use the find command to try to locate them.
# For example, NUMPY_INCLUDE_PATH contains the file
# arrayobject.h. So you can search for it like this:
#
# find /usr -name arrayobject.h
#
# (it'll almost certainly be under /usr)
# CUDA toolkit installation directory.
export CUDA_INSTALL_PATH=/usr/local/cuda
# Python include directory. This should contain the file Python.h, among others.
export PYTHON_INCLUDE_PATH=/usr/include/python2.7
# Numpy include directory. This should contain the file arrayobject.h, among others.
export NUMPY_INCLUDE_PATH=/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/
# ATLAS library directory. This should contain the file libcblas.so, among others.
export ATLAS_LIB_PATH=/usr/lib/atlas-base
# You don't have to change these:
export LD_LIBRARY_PATH=$CUDA_INSTALL_PATH/lib64:$LD_LIBRARY_PATH
export CUDA_SDK_PATH=$CUDA_INSTALL_PATH/samples
export PATH=$PATH:$CUDA_INSTALL_PATH/bin
cd util && make numpy=1 -j $* && cd ..
cd nvmatrix && make -j $* && cd ..
cd cudaconv3 && make -j $* && cd ..
cd cudaconvnet && make -j $* && cd ..
cd make-data/pyext && make -j $* && cd ../..

291
caffe2/contrib/cuda-convnet2/convdata.py Normal file
View File

@ -0,0 +1,291 @@
# Copyright 2014 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from python_util.data import *
import numpy.random as nr
import numpy as n
import random as r
from time import time
from threading import Thread
from math import sqrt
import sys
#from matplotlib import pylab as pl
from PIL import Image
from StringIO import StringIO
import itertools as it
class JPEGBatchLoaderThread(Thread):
def __init__(self, dp, batch_num, label_offset, list_out):
Thread.__init__(self)
self.list_out = list_out
self.label_offset = label_offset
self.dp = dp
self.batch_num = batch_num
@staticmethod
def load_jpeg_batch(rawdics, dp, label_offset):
if type(rawdics) != list:
rawdics = [rawdics]
nc_total = sum(len(r['data']) for r in rawdics)
jpeg_strs = list(it.chain.from_iterable(rd['data'] for rd in rawdics))
labels = list(it.chain.from_iterable(rd['labels'] for rd in rawdics))
img_mat = n.empty((nc_total * dp.data_mult, dp.inner_pixels * dp.num_colors), dtype=n.float32)
lab_mat = n.zeros((nc_total, dp.get_num_classes()), dtype=n.float32)
dp.convnet.libmodel.decodeJpeg(jpeg_strs, img_mat, dp.img_size, dp.inner_size, dp.test, dp.multiview)
lab_vec = n.tile(n.asarray([(l[nr.randint(len(l))] if len(l) > 0 else -1) + label_offset for l in labels], dtype=n.single).reshape((nc_total, 1)), (dp.data_mult,1))
for c in xrange(nc_total):
lab_mat[c, [z + label_offset for z in labels[c]]] = 1
lab_mat = n.tile(lab_mat, (dp.data_mult, 1))
return {'data': img_mat[:nc_total * dp.data_mult,:],
'labvec': lab_vec[:nc_total * dp.data_mult,:],
'labmat': lab_mat[:nc_total * dp.data_mult,:]}
def run(self):
rawdics = self.dp.get_batch(self.batch_num)
p = JPEGBatchLoaderThread.load_jpeg_batch(rawdics,
self.dp,
self.label_offset)
self.list_out.append(p)
class ColorNoiseMakerThread(Thread):
def __init__(self, pca_stdevs, pca_vecs, num_noise, list_out):
Thread.__init__(self)
self.pca_stdevs, self.pca_vecs = pca_stdevs, pca_vecs
self.num_noise = num_noise
self.list_out = list_out
def run(self):
noise = n.dot(nr.randn(self.num_noise, 3).astype(n.single) * self.pca_stdevs.T, self.pca_vecs.T)
self.list_out.append(noise)
class ImageDataProvider(LabeledDataProvider):
def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params=None, test=False):
LabeledDataProvider.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
self.data_mean = self.batch_meta['data_mean'].astype(n.single)
self.color_eig = self.batch_meta['color_pca'][1].astype(n.single)
self.color_stdevs = n.c_[self.batch_meta['color_pca'][0].astype(n.single)]
self.color_noise_coeff = dp_params['color_noise']
self.num_colors = 3
self.img_size = int(sqrt(self.batch_meta['num_vis'] / self.num_colors))
self.mini = dp_params['minibatch_size']
self.inner_size = dp_params['inner_size'] if dp_params['inner_size'] > 0 else self.img_size
self.inner_pixels = self.inner_size **2
self.border_size = (self.img_size - self.inner_size) / 2
self.multiview = dp_params['multiview_test'] and test
self.num_views = 5*2
self.data_mult = self.num_views if self.multiview else 1
self.batch_size = self.batch_meta['batch_size']
self.label_offset = 0 if 'label_offset' not in self.batch_meta else self.batch_meta['label_offset']
self.scalar_mean = dp_params['scalar_mean']
# Maintain pointers to previously-returned data matrices so they don't get garbage collected.
self.data = [None, None]
self.loader_thread, self.color_noise_thread = None, None
self.convnet = dp_params['convnet']
self.num_noise = self.batch_size
self.batches_generated, self.loaders_started = 0, 0
self.data_mean_crop = self.data_mean.reshape((self.num_colors,self.img_size,self.img_size))[:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size].reshape((1,3*self.inner_size**2))
if self.scalar_mean >= 0:
self.data_mean_crop = self.scalar_mean
def showimg(self, img):
from matplotlib import pylab as pl
pixels = img.shape[0] / 3
size = int(sqrt(pixels))
img = img.reshape((3,size,size)).swapaxes(0,2).swapaxes(0,1)
pl.imshow(img, interpolation='nearest')
pl.show()
def get_data_dims(self, idx=0):
if idx == 0:
return self.inner_size**2 * 3
if idx == 2:
return self.get_num_classes()
return 1
def start_loader(self, batch_idx):
self.load_data = []
self.loader_thread = JPEGBatchLoaderThread(self,
self.batch_range[batch_idx],
self.label_offset,
self.load_data)
self.loader_thread.start()
def start_color_noise_maker(self):
color_noise_list = []
self.color_noise_thread = ColorNoiseMakerThread(self.color_stdevs, self.color_eig, self.num_noise, color_noise_list)
self.color_noise_thread.start()
return color_noise_list
def set_labels(self, datadic):
pass
def get_data_from_loader(self):
if self.loader_thread is None:
self.start_loader(self.batch_idx)
self.loader_thread.join()
self.data[self.d_idx] = self.load_data[0]
self.start_loader(self.get_next_batch_idx())
else:
# Set the argument to join to 0 to re-enable batch reuse
self.loader_thread.join()
if not self.loader_thread.is_alive():
self.data[self.d_idx] = self.load_data[0]
self.start_loader(self.get_next_batch_idx())
#else:
# print "Re-using batch"
self.advance_batch()
def add_color_noise(self):
# At this point the data already has 0 mean.
# So I'm going to add noise to it, but I'm also going to scale down
# the original data. This is so that the overall scale of the training
# data doesn't become too different from the test data.
s = self.data[self.d_idx]['data'].shape
cropped_size = self.get_data_dims(0) / 3
ncases = s[0]
if self.color_noise_thread is None:
self.color_noise_list = self.start_color_noise_maker()
self.color_noise_thread.join()
self.color_noise = self.color_noise_list[0]
self.color_noise_list = self.start_color_noise_maker()
else:
self.color_noise_thread.join(0)
if not self.color_noise_thread.is_alive():
self.color_noise = self.color_noise_list[0]
self.color_noise_list = self.start_color_noise_maker()
self.data[self.d_idx]['data'] = self.data[self.d_idx]['data'].reshape((ncases*3, cropped_size))
self.color_noise = self.color_noise[:ncases,:].reshape((3*ncases, 1))
self.data[self.d_idx]['data'] += self.color_noise * self.color_noise_coeff
self.data[self.d_idx]['data'] = self.data[self.d_idx]['data'].reshape((ncases, 3* cropped_size))
self.data[self.d_idx]['data'] *= 1.0 / (1.0 + self.color_noise_coeff) # <--- NOTE: This is the slow line, 0.25sec. Down from 0.75sec when I used division.
def get_next_batch(self):
self.d_idx = self.batches_generated % 2
epoch, batchnum = self.curr_epoch, self.curr_batchnum
self.get_data_from_loader()
# Subtract mean
self.data[self.d_idx]['data'] -= self.data_mean_crop
if self.color_noise_coeff > 0 and not self.test:
self.add_color_noise()
self.batches_generated += 1
return epoch, batchnum, [self.data[self.d_idx]['data'].T, self.data[self.d_idx]['labvec'].T, self.data[self.d_idx]['labmat'].T]
# Takes as input an array returned by get_next_batch
# Returns a (numCases, imgSize, imgSize, 3) array which can be
# fed to pylab for plotting.
# This is used by shownet.py to plot test case predictions.
def get_plottable_data(self, data, add_mean=True):
mean = self.data_mean_crop.reshape((data.shape[0], 1))
return n.require((data + (mean if add_mean else 0)).T.reshape(data.shape[1], 3, self.inner_size, self.inner_size).swapaxes(1,3).swapaxes(1,2) / 255.0, dtype=n.single)
class CIFARDataProvider(LabeledDataProvider):
def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params=None, test=False):
LabeledDataProvider.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
self.img_size = 32
self.num_colors = 3
self.inner_size = dp_params['inner_size'] if dp_params['inner_size'] > 0 else self.batch_meta['img_size']
self.border_size = (self.img_size - self.inner_size) / 2
self.multiview = dp_params['multiview_test'] and test
self.num_views = 9
self.scalar_mean = dp_params['scalar_mean']
self.data_mult = self.num_views if self.multiview else 1
self.data_dic = []
for i in batch_range:
self.data_dic += [unpickle(self.get_data_file_name(i))]
self.data_dic[-1]["labels"] = n.require(self.data_dic[-1]['labels'], dtype=n.single)
self.data_dic[-1]["labels"] = n.require(n.tile(self.data_dic[-1]["labels"].reshape((1, n.prod(self.data_dic[-1]["labels"].shape))), (1, self.data_mult)), requirements='C')
self.data_dic[-1]['data'] = n.require(self.data_dic[-1]['data'] - self.scalar_mean, dtype=n.single, requirements='C')
self.cropped_data = [n.zeros((self.get_data_dims(), self.data_dic[0]['data'].shape[1]*self.data_mult), dtype=n.single) for x in xrange(2)]
self.batches_generated = 0
self.data_mean = self.batch_meta['data_mean'].reshape((self.num_colors,self.img_size,self.img_size))[:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size].reshape((self.get_data_dims(), 1))
def get_next_batch(self):
epoch, batchnum = self.curr_epoch, self.curr_batchnum
self.advance_batch()
bidx = batchnum - self.batch_range[0]
cropped = self.cropped_data[self.batches_generated % 2]
self.__trim_borders(self.data_dic[bidx]['data'], cropped)
cropped -= self.data_mean
self.batches_generated += 1
return epoch, batchnum, [cropped, self.data_dic[bidx]['labels']]
def get_data_dims(self, idx=0):
return self.inner_size**2 * self.num_colors if idx == 0 else 1
# Takes as input an array returned by get_next_batch
# Returns a (numCases, imgSize, imgSize, 3) array which can be
# fed to pylab for plotting.
# This is used by shownet.py to plot test case predictions.
def get_plottable_data(self, data):
return n.require((data + self.data_mean).T.reshape(data.shape[1], 3, self.inner_size, self.inner_size).swapaxes(1,3).swapaxes(1,2) / 255.0, dtype=n.single)
def __trim_borders(self, x, target):
y = x.reshape(self.num_colors, self.img_size, self.img_size, x.shape[1])
if self.test: # don't need to loop over cases
if self.multiview:
start_positions = [(0,0), (0, self.border_size), (0, self.border_size*2),
(self.border_size, 0), (self.border_size, self.border_size), (self.border_size, self.border_size*2),
(self.border_size*2, 0), (self.border_size*2, self.border_size), (self.border_size*2, self.border_size*2)]
end_positions = [(sy+self.inner_size, sx+self.inner_size) for (sy,sx) in start_positions]
for i in xrange(self.num_views):
target[:,i * x.shape[1]:(i+1)* x.shape[1]] = y[:,start_positions[i][0]:end_positions[i][0],start_positions[i][1]:end_positions[i][1],:].reshape((self.get_data_dims(),x.shape[1]))
else:
pic = y[:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size, :] # just take the center for now
target[:,:] = pic.reshape((self.get_data_dims(), x.shape[1]))
else:
for c in xrange(x.shape[1]): # loop over cases
startY, startX = nr.randint(0,self.border_size*2 + 1), nr.randint(0,self.border_size*2 + 1)
endY, endX = startY + self.inner_size, startX + self.inner_size
pic = y[:,startY:endY,startX:endX, c]
if nr.randint(2) == 0: # also flip the image with 50% probability
pic = pic[:,:,::-1]
target[:,c] = pic.reshape((self.get_data_dims(),))
class DummyConvNetLogRegDataProvider(LabeledDummyDataProvider):
def __init__(self, data_dim):
LabeledDummyDataProvider.__init__(self, data_dim)
self.img_size = int(sqrt(data_dim/3))
def get_next_batch(self):
epoch, batchnum, dic = LabeledDummyDataProvider.get_next_batch(self)
dic = {'data': dic[0], 'labels': dic[1]}
print dic['data'].shape, dic['labels'].shape
return epoch, batchnum, [dic['data'], dic['labels']]
# Returns the dimensionality of the two data matrices returned by get_next_batch
def get_data_dims(self, idx=0):
return self.batch_meta['num_vis'] if idx == 0 else 1
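
The color-noise path in ImageDataProvider is the PCA-based lighting augmentation from Krizhevsky et al.: ColorNoiseMakerThread draws one additive RGB offset per case along the color principal components stored in batch_meta['color_pca']. A standalone numpy sketch of the same arithmetic, with illustrative stand-in values for the PCA statistics:

import numpy as np

num_cases = 4
# Stand-ins for batch_meta['color_pca']: per-component stdevs as a column, and eigenvectors.
pca_stdevs = np.array([[0.2], [0.05], [0.02]], dtype=np.float32)  # shape (3, 1)
pca_vecs = np.eye(3, dtype=np.float32)                            # shape (3, 3)

# Same expression as ColorNoiseMakerThread.run(): scale white noise by the
# per-component stdevs, then rotate into RGB space.
noise = np.dot(np.random.randn(num_cases, 3).astype(np.float32) * pca_stdevs.T, pca_vecs.T)
print(noise.shape)  # (4, 3): one RGB offset per case, later broadcast over all pixels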

View File

@ -0,0 +1,289 @@
# Copyright 2014 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as n
import numpy.random as nr
import random as r
from python_util.util import *
from python_util.data import *
from python_util.options import *
from python_util.gpumodel import *
import sys
import math as m
import layer as lay
from convdata import ImageDataProvider, CIFARDataProvider, DummyConvNetLogRegDataProvider
from os import linesep as NL
import copy as cp
import os
class Driver(object):
def __init__(self, convnet):
self.convnet = convnet
def on_start_batch(self, batch_data, train):
pass
def on_finish_batch(self):
pass
class GradCheckDriver(Driver):
def on_start_batch(self, batch_data, train):
data = batch_data[2]
self.convnet.libmodel.checkGradients(data)
class TrainingDriver(Driver):
def on_start_batch(self, batch_data, train):
data = batch_data[2]
self.convnet.libmodel.startBatch(data, self.convnet.get_progress(), not train)
class MultiviewTestDriver(TrainingDriver):
def on_start_batch(self, batch_data, train):
self.write_output = False
if train:
TrainingDriver.on_start_batch(self, batch_data, train)
else:
data = batch_data[2]
num_views = self.convnet.test_data_provider.num_views
if self.convnet.test_out != "" and self.convnet.logreg_name != "":
self.write_output = True
self.test_file_name = os.path.join(self.convnet.test_out, 'test_preds_%d' % batch_data[1])
self.probs = n.zeros((data[0].shape[1]/num_views, self.convnet.test_data_provider.get_num_classes()), dtype=n.single)
self.convnet.libmodel.startMultiviewTest(data, num_views, self.probs, self.convnet.logreg_name)
else:
self.convnet.libmodel.startMultiviewTest(data, num_views)
def on_finish_batch(self):
if self.write_output:
if not os.path.exists(self.convnet.test_out):
os.makedirs(self.convnet.test_out)
pickle(self.test_file_name, {'data': self.probs,
'note': 'generated from %s' % self.convnet.save_file})
class FeatureWriterDriver(Driver):
def __init__(self, convnet):
Driver.__init__(self, convnet)
self.last_batch = convnet.test_batch_range[-1]
def on_start_batch(self, batch_data, train):
if train:
raise ModelStateException("FeatureWriter must be used in conjunction with --test-only=1. It writes test data features.")
self.batchnum, self.data = batch_data[1], batch_data[2]
if not os.path.exists(self.convnet.feature_path):
os.makedirs(self.convnet.feature_path)
self.num_ftrs = self.convnet.layers[self.convnet.write_features]['outputs']
self.ftrs = n.zeros((self.data[0].shape[1], self.num_ftrs), dtype=n.single)
self.convnet.libmodel.startFeatureWriter(self.data, [self.ftrs], [self.convnet.write_features])
def on_finish_batch(self):
path_out = os.path.join(self.convnet.feature_path, 'data_batch_%d' % self.batchnum)
pickle(path_out, {'data': self.ftrs, 'labels': self.data[1]})
print "Wrote feature file %s" % path_out
if self.batchnum == self.last_batch:
pickle(os.path.join(self.convnet.feature_path, 'batches.meta'), {'source_model':self.convnet.load_file,
'num_vis':self.num_ftrs,
'batch_size': self.convnet.test_data_provider.batch_meta['batch_size']})
class ConvNet(IGPUModel):
def __init__(self, op, load_dic, dp_params={}):
filename_options = []
for v in ('color_noise', 'multiview_test', 'inner_size', 'scalar_mean', 'minibatch_size'):
dp_params[v] = op.get_value(v)
IGPUModel.__init__(self, "ConvNet", op, load_dic, filename_options, dp_params=dp_params)
def import_model(self):
lib_name = "cudaconvnet._ConvNet"
print "========================="
print "Importing %s C++ module" % lib_name
self.libmodel = __import__(lib_name,fromlist=['_ConvNet'])
def init_model_lib(self):
self.libmodel.initModel(self.layers,
self.device_ids,
self.minibatch_size,
self.conserve_mem)
def init_model_state(self):
ms = self.model_state
layers = ms['layers'] if self.loaded_from_checkpoint else {}
ms['layers'] = lay.LayerParser.parse_layers(os.path.join(self.layer_path, self.layer_def),
os.path.join(self.layer_path, self.layer_params), self, layers=layers)
self.do_decouple_conv()
self.do_unshare_weights()
self.op.set_value('conv_to_local', [], parse=False)
self.op.set_value('unshare_weights', [], parse=False)
self.set_driver()
def do_decouple_conv(self):
# Convert convolutional layers to local
if len(self.op.get_value('conv_to_local')) > 0:
for lname in self.op.get_value('conv_to_local'):
if self.model_state['layers'][lname]['type'] == 'conv':
lay.LocalLayerParser.conv_to_local(self.model_state['layers'], lname)
def do_unshare_weights(self):
# Decouple weight matrices
if len(self.op.get_value('unshare_weights')) > 0:
for name_str in self.op.get_value('unshare_weights'):
if name_str:
name = lay.WeightLayerParser.get_layer_name(name_str)
if name is not None:
name, idx = name[0], name[1]
if name not in self.model_state['layers']:
raise ModelStateException("Layer '%s' does not exist; unable to unshare" % name)
layer = self.model_state['layers'][name]
lay.WeightLayerParser.unshare_weights(layer, self.model_state['layers'], matrix_idx=idx)
else:
raise ModelStateException("Invalid layer name '%s'; unable to unshare." % name_str)
def set_driver(self):
if self.op.get_value('check_grads'):
self.driver = GradCheckDriver(self)
elif self.op.get_value('multiview_test'):
self.driver = MultiviewTestDriver(self)
elif self.op.get_value('write_features'):
self.driver = FeatureWriterDriver(self)
else:
self.driver = TrainingDriver(self)
def fill_excused_options(self):
if self.op.get_value('check_grads'):
self.op.set_value('save_path', '')
self.op.set_value('train_batch_range', '0')
self.op.set_value('test_batch_range', '0')
self.op.set_value('data_path', '')
# Make sure the data provider returned data in proper format
def parse_batch_data(self, batch_data, train=True):
if any(d.dtype != n.single for d in batch_data[2]):
raise DataProviderException("All matrices returned by data provider must consist of single-precision floats.")
return batch_data
def start_batch(self, batch_data, train=True):
self.driver.on_start_batch(batch_data, train)
def finish_batch(self):
ret = IGPUModel.finish_batch(self)
self.driver.on_finish_batch()
return ret
def print_iteration(self):
print "%d.%d (%.2f%%)..." % (self.epoch, self.batchnum, 100 * self.get_progress()),
def print_train_time(self, compute_time_py):
print "(%.3f sec)" % (compute_time_py)
def print_costs(self, cost_outputs):
costs, num_cases = cost_outputs[0], cost_outputs[1]
children = set()
for errname in costs:
if sum(errname in self.layers[z]['children'] for z in costs) == 0:
# print self.layers[errname]['children']
for child in set(self.layers[errname]['children']) & set(costs.keys()):
costs[errname] = [v + u for v, u in zip(costs[errname], costs[child])]
children.add(child)
filtered_costs = eval(self.layers[errname]['outputFilter'])(costs[errname], num_cases)
print "%s: " % errname,
if 'outputFilterFormatter' not in self.layers[errname]:
print ", ".join("%.6f" % v for v in filtered_costs),
else:
print eval(self.layers[errname]['outputFilterFormatter'])(self,filtered_costs),
if m.isnan(filtered_costs[0]) or m.isinf(filtered_costs[0]):
print "<- error nan or inf!"
sys.exit(1)
for c in children:
del costs[c]
def print_train_results(self):
self.print_costs(self.train_outputs[-1])
def print_test_status(self):
pass
def print_test_results(self):
print NL + "======================Test output======================"
self.print_costs(self.test_outputs[-1])
if not self.test_only:
print NL + "----------------------Averages-------------------------"
self.print_costs(self.aggregate_test_outputs(self.test_outputs[-len(self.test_batch_range):]))
print NL + "-------------------------------------------------------",
for name,val in sorted(self.layers.items(), key=lambda x: x[1]['id']): # This is kind of hacky but will do for now.
l = self.layers[name]
if 'weights' in l:
wscales = [(l['name'], i, n.mean(n.abs(w)), n.mean(n.abs(wi))) for i,(w,wi) in enumerate(zip(l['weights'],l['weightsInc']))]
print ""
print NL.join("Layer '%s' weights[%d]: %e [%e] [%e]" % (s[0], s[1], s[2], s[3], s[3]/s[2] if s[2] > 0 else 0) for s in wscales),
print "%sLayer '%s' biases: %e [%e]" % (NL, l['name'], n.mean(n.abs(l['biases'])), n.mean(n.abs(l['biasesInc']))),
print ""
def conditional_save(self):
self.save_state()
def aggregate_test_outputs(self, test_outputs):
test_outputs = cp.deepcopy(test_outputs)
num_cases = sum(t[1] for t in test_outputs)
for i in xrange(1, len(test_outputs)):
for k,v in test_outputs[i][0].items():
for j in xrange(len(v)):
test_outputs[0][0][k][j] += test_outputs[i][0][k][j]
return (test_outputs[0][0], num_cases)
@classmethod
def get_options_parser(cls):
op = IGPUModel.get_options_parser()
op.add_option("mini", "minibatch_size", IntegerOptionParser, "Minibatch size", default=128)
op.add_option("layer-def", "layer_def", StringOptionParser, "Layer definition file", set_once=False)
op.add_option("layer-params", "layer_params", StringOptionParser, "Layer parameter file")
op.add_option("layer-path", "layer_path", StringOptionParser, "Layer file path prefix", default="")
op.add_option("check-grads", "check_grads", BooleanOptionParser, "Check gradients and quit?", default=0, excuses=['data_path','save_path', 'save_file_override', 'train_batch_range','test_batch_range'])
op.add_option("multiview-test", "multiview_test", BooleanOptionParser, "Cropped DP: test on multiple patches?", default=0)
op.add_option("inner-size", "inner_size", IntegerOptionParser, "Cropped DP: crop size (0 = don't crop)", default=0, set_once=True)
op.add_option("conv-to-local", "conv_to_local", ListOptionParser(StringOptionParser), "Convert given conv layers to unshared local", default=[])
op.add_option("unshare-weights", "unshare_weights", ListOptionParser(StringOptionParser), "Unshare weight matrices in given layers", default=[])
op.add_option("conserve-mem", "conserve_mem", BooleanOptionParser, "Conserve GPU memory (slower)?", default=0)
op.add_option("color-noise", "color_noise", FloatOptionParser, "Add PCA noise to color channels with given scale", default=0.0)
op.add_option("test-out", "test_out", StringOptionParser, "Output test case predictions to given path", default="", requires=['logreg_name', 'multiview_test'])
op.add_option("logreg-name", "logreg_name", StringOptionParser, "Logreg cost layer name (for --test-out)", default="")
op.add_option("scalar-mean", "scalar_mean", FloatOptionParser, "Subtract this scalar from image (-1 = don't)", default=-1)
op.add_option("write-features", "write_features", StringOptionParser, "Write test data features from given layer", default="", requires=['feature-path'])
op.add_option("feature-path", "feature_path", StringOptionParser, "Write test data features to this path (to be used with --write-features)", default="")
op.delete_option('max_test_err')
op.options["testing_freq"].default = 57
op.options["num_epochs"].default = 50000
op.options['dp_type'].default = None
DataProvider.register_data_provider('dummy-lr-n', 'Dummy ConvNet logistic regression', DummyConvNetLogRegDataProvider)
DataProvider.register_data_provider('image', 'JPEG-encoded image data provider', ImageDataProvider)
DataProvider.register_data_provider('cifar', 'CIFAR-10 data provider', CIFARDataProvider)
return op
if __name__ == "__main__":
# nr.seed(6)
op = ConvNet.get_options_parser()
op, load_dic = IGPUModel.parse_options(op)
model = ConvNet(op, load_dic)
model.start()
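
Note that set_driver() makes one entry point cover training, gradient checking, multiview testing, and feature extraction, switched purely by options. A hypothetical invocation (the file names here are illustrative, and the data/save-path flags come from IGPUModel's base parser, which this diff does not show):

python convnet.py --layer-def=layers.cfg --layer-params=params.cfg --inner-size=224 --multiview-test=1 --test-only=1 ...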

View File

@ -0,0 +1,108 @@
################################################################################
#
# Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
#
# NOTICE TO USER:
#
# This source code is subject to NVIDIA ownership rights under U.S. and
# international Copyright laws.
#
# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
# OR PERFORMANCE OF THIS SOURCE CODE.
#
# U.S. Government End Users. This source code is a "commercial item" as
# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
# "commercial computer software" and "commercial computer software
# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
# and is provided to the U.S. Government only as a commercial end item.
# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
# source code with only those rights set forth herein.
#
################################################################################
# Location of the CUDA Toolkit binaries and libraries
CUDA_INC_PATH = $(CUDA_INSTALL_PATH)/include
CUDA_BIN_PATH = $(CUDA_INSTALL_PATH)/bin
CUDA_LIB_PATH = $(CUDA_INSTALL_PATH)/lib64
# Common binaries
NVCC = $(CUDA_BIN_PATH)/nvcc
GCC = g++
AR = ar
# CUDA code generation flags
GENCODE_SM35 := -gencode arch=compute_35,code=sm_35
GENCODE_FLAGS := $(GENCODE_SM35)
LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart
CCFLAGS := -m64
NVCCFLAGS := -m64
# Debug build flags
ifeq ($(dbg),1)
CCFLAGS += -g
NVCCFLAGS += -g -G
DBG := debug
else
DBG := release
NVCCFLAGS += -O3
CCFLAGS += -O3
endif
# Add profiler output
ifeq ($(prof),1)
NVCCFLAGS += --ptxas-options=-v
endif
TARGETDIR := ./bin/$(DBG)
OBJDIR := ./obj/$(DBG)
########## USER STUFF ###########
LDFLAGS += -L../util -lutilpy -L../nvmatrix -lnvmatrix -lcublas
INCLUDES := -I$(CUDA_INC_PATH) -I $(CUDA_SDK_PATH)/common/inc -I./include
CUFILES := $(shell find . -name "*.cu")
CU_DEPS := $(shell find . -name "*.cuh")
CCFILES := $(shell find . -name "*.cpp")
C_DEPS := $(shell find . -name "*.h")
NVCCFLAGS += --compiler-options '-fPIC'
LDFLAGS += -shared
CCFLAGS += -fPIC
TARGET := $(TARGETDIR)/libcudaconv.so
################################################################################
# Set up target and object files
################################################################################
OBJS += $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(CCFILES))
OBJS += $(patsubst %.c,$(OBJDIR)/%.c.o,$(CFILES))
OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu.o,$(CUFILES))
# Target rules
all: makedirs $(TARGET)
$(OBJDIR)/%.cu.o : %.cu $(CU_DEPS)
$(NVCC) $(NVCCFLAGS) $(GENCODE_FLAGS) $(INCLUDES) -o $@ -c $<
$(OBJDIR)/%.cpp.o : %.cpp $(C_DEPS)
$(GCC) $(CCFLAGS) $(INCLUDES) -o $@ -c $<
$(TARGET): $(OBJS)
$(GCC) $(CCFLAGS) -o $@ $+ $(LDFLAGS)
ln -sf $(TARGET) .
makedirs:
mkdir -p $(TARGETDIR)
mkdir -p $(OBJDIR)/src
clean:
rm -rf ./obj
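
The object rules above mirror the source tree under ./obj/<build type>, and make dbg=1 flips both the flag set and the output directory (make prof=1 adds ptxas statistics). A small Python sketch of the path mapping the patsubst rules perform, for illustration only:

def obj_path(src, dbg=False):
    # %.cu -> $(OBJDIR)/%.cu.o and %.cpp -> $(OBJDIR)/%.cpp.o,
    # with OBJDIR = ./obj/debug when dbg=1, else ./obj/release.
    objdir = "./obj/debug" if dbg else "./obj/release"
    return "%s/%s.o" % (objdir, src)

print(obj_path("src/conv_util.cu"))            # ./obj/release/src/conv_util.cu.o
print(obj_path("src/conv_util.cu", dbg=True))  # ./obj/debug/src/conv_util.cu.o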

View File

@ -0,0 +1,648 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef CONV_UTIL_CUH
#define CONV_UTIL_CUH
#include "../../nvmatrix/include/nvmatrix.cuh"
#include "caffe2/core/context_gpu.h"
#ifndef MIN
#define MIN(a, b) ((a) > (b) ? (b) : (a))
#endif
#ifndef MAX
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif
void convLocalMaxUndo(NVMatrix& images, NVMatrix& maxGrads, NVMatrix& maxActs, NVMatrix& target,
int subsX, int startX, int strideX, int outputsX);
void convLocalAvgUndo(NVMatrix& avgGrads, NVMatrix& target,
int subsX, int startX, int strideX, int outputsX, int imgSize, bool sum);
void convLocalAvgUndo(NVMatrix& avgGrads, NVMatrix& target,
int subsX, int startX, int strideX, int outputsX, int imgSize, bool sum,
float scaleTargets, float scaleOutput);
void convLocalMaxUndo(NVMatrix& images, NVMatrix& maxGrads, NVMatrix& maxActs, NVMatrix& target,
int subsX, int startX, int strideX, int outputsX, float scaleTargets, float scaleOutput);
void convResponseNorm(NVMatrix& images, NVMatrix& denoms, NVMatrix& target, int numFilters, int sizeX, float addScale, float powScale, float minDiv);
void convResponseNormUndo(NVMatrix& outGrads, NVMatrix& denoms, NVMatrix& inputs, NVMatrix& acts, NVMatrix& target, int numFilters,
int sizeX, float addScale, float powScale, float scaleTargets, float scaleOutput);
void convContrastNorm(NVMatrix& images, NVMatrix& meanDiffs, NVMatrix& denoms, NVMatrix& target, int numFilters, int sizeX, float addScale, float powScale, float minDiv);
void convContrastNormUndo(NVMatrix& outGrads, NVMatrix& denoms, NVMatrix& meanDiffs, NVMatrix& acts, NVMatrix& target, int numFilters,
int sizeX, float addScale, float powScale, float scaleTargets, float scaleOutput);
void convGaussianBlur(NVMatrix& images, NVMatrix& filter, NVMatrix& target, bool horiz, int numChannels,
float scaleTargets, float scaleOutputs);
void convBedOfNails(NVMatrix& images, NVMatrix& target, int numChannels, int imgSize, int startX,
int strideX, float scaleTargets, float scaleOutput);
void convBedOfNailsUndo(NVMatrix& actsGrad, NVMatrix& target, int numChannels, int imgSize,
int startX, int strideX, float scaleTargets, float scaleOutput);
void convResizeBilinear(NVMatrix& images, NVMatrix& target, int imgSize, int tgtSize, float scale);
void convRGBToYUV(NVMatrix& images, NVMatrix& target);
void convRGBToLAB(NVMatrix& images, NVMatrix& target, bool center);
void convCrop(NVMatrix& imgs, NVMatrix& target, int imgSize, int tgtSize, int startY, int startX);
void normalizeLocalWeights(NVMatrix& weights, int numModules, float norm);
void convContrastNormCrossMap(NVMatrix& images, NVMatrix& meanDiffs, NVMatrix& target,
int numFilters, int sizeF, float addScale, float powScale, float minDiv, bool blocked);
void convResponseNormCrossMapUndo(NVMatrix& outGrads, NVMatrix& inputs, NVMatrix& acts, NVMatrix& target, int numFilters,
int sizeF, float addScale, float powScale, float minDiv, bool blocked, float scaleTargets, float scaleOutput);
void convResponseNormCrossMap(NVMatrix& images, NVMatrix& target, int numFilters, int sizeF, float addScale,
float powScale, bool blocked);
void convResponseNormCrossMap(NVMatrix& images, NVMatrix& target, int numFilters, int sizeF, float addScale,
float powScale, float minDiv, bool blocked);
void convReflectHorizontal(NVMatrix& images, NVMatrix& targets, int imgSize);
void convCrossMapMaxPoolUndo(NVMatrix& images, NVMatrix& maxGrads, NVMatrix& maxActs, NVMatrix& target,
const int imgSize, const int startF, const int poolSize,
const int stride, const float scaleTargets, const float scaleOutputs);
cudaTextureObject_t GetTensorTextureObject(caffe2::TensorCUDA* tensor);
template<bool sum>
class AvgPooler {
public:
__device__ inline float operator()(const float a, const float b) const {
return a + b;
}
__device__ inline float getBaseValue() const {
return 0;
}
__device__ inline float output(const float a, const int regionSize) const {
return sum ? a : (a / regionSize);
}
};
class MaxPooler {
public:
__device__ inline float operator()(const float a, const float b) const {
return fmaxf(a, b);
}
__device__ inline float getBaseValue() const {
return -2e38;
}
__device__ inline float output(const float a, const int regionSize) const {
return a;
}
};
class MaxAbsPooler {
public:
__device__ inline float operator()(const float a, const float b) const {
return fabsf(a) > fabsf(b) ? a : b;
}
__device__ inline float getBaseValue() const {
return 0.0f;
}
__device__ inline float output(const float a, const int regionSize) const {
return a;
}
};
/*
* Block size B_YxB_X
* blockIdx.x determines output.x, image idx in batches of B_X*imgsPerThread
* blockIdx.y determines output.y, filter idx in batches of B_Y*filtersPerThread
*
* So each block does one output for some number of images/filters.
*
* threadIdx.x determines img idx
* threadIdx.y determines filter idx
*
* imgs: (numFilters, imgPixels, numImages)
* target: (numFilters, numOutputs, numImages)
*
* numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
*/
template<class Agg, int B_Y, int B_X, int imgsPerThread, int filtersPerThread, bool checkCaseBounds>
__global__ void kLocalPool(float* imgs, float* target, const int imgSize, const int numFilters,
const int numImages, const int subsX, const int startX, const int strideX,
const int outputsX, Agg agg) {
const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread);
const int numFilterBlocks = DIVUP(numFilters, B_Y*filtersPerThread);
const int outputIdxX = blockIdx.x / numImgBlocks;
const int outputIdxY = blockIdx.y / numFilterBlocks;
const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * B_Y * filtersPerThread;
const int myFilterIdx = (blockFilterIdx + threadIdx.y*filtersPerThread);
if (myFilterIdx >= numFilters) {
return;
}
const int outputIdx = outputIdxY * outputsX + outputIdxX;
const int numOutputs = outputsX * outputsX;
const int imgPixels = imgSize * imgSize;
const int startImgPxX = startX + outputIdxX * strideX;
const int startImgPxY = startX + outputIdxY * strideX;
const int imgIdx = blockImgIdx + threadIdx.x;
imgs += myFilterIdx * imgPixels * numImages + imgIdx;
target += (myFilterIdx * numOutputs + outputIdx) * numImages + imgIdx;
float prod[filtersPerThread][imgsPerThread];
#pragma unroll
for (int f = 0; f < filtersPerThread; f++) {
#pragma unroll
for (int i = 0; i < imgsPerThread; i++) {
prod[f][i] = agg.getBaseValue();
}
}
const int loopStartY = MAX(0, startImgPxY);
const int loopStartX = MAX(0, startImgPxX);
const int loopEndY = MIN(imgSize, startImgPxY + subsX);
const int loopEndX = MIN(imgSize, startImgPxX + subsX);
const int regionSize = (loopEndY - loopStartY) * (loopEndX - loopStartX);
for (int y = loopStartY; y < loopEndY; y++) {
for (int x = loopStartX; x < loopEndX; x++) {
const int imgPx = y * imgSize + x;
#pragma unroll
for (int i = 0; i < imgsPerThread; i++) {
if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
#pragma unroll
for (int f = 0; f < filtersPerThread; f++) {
prod[f][i] = agg(prod[f][i], imgs[(f * imgPixels + imgPx) * numImages + i * B_X]);
}
}
}
}
}
#pragma unroll
for (int i = 0; i < imgsPerThread; i++) {
if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
#pragma unroll
for (int f = 0; f < filtersPerThread; f++) {
target[f * numOutputs * numImages + i * B_X] = agg.output(prod[f][i], regionSize);
}
}
}
}
/*
* Block size B_YxB_X
* blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
* blockIdx.y determines pixel.y, output idx in batches of B_Y
*
* So each block does one pixel for some number of images/filters.
*
* threadIdx.x determines img idx
* threadIdx.y determines output idx
*
* imgs: (numFilters, imgPixels, numImages)
* target: (numOutputs, imgPixels, numImages) (out)
*
* numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
*/
template<class Agg, int B_Y, int B_X, int imgsPerThread, bool checkCaseBounds>
__global__ void kPoolCrossMap(float* imgs, float* target, const int imgSize,
const int numFilters, const int numImages, const int startF, const int poolSize,
const int numOutputs, const int stride, Agg agg) {
const int imgPixels = imgSize * imgSize;
const int numImgBlocks = DIVUP(numImages, B_X*imgsPerThread);
// const int numOutputs = DIVUP(numFilters, stride);
const int numOutputBlocks = DIVUP(numOutputs,B_Y);
const int pxIdxX = blockIdx.x / numImgBlocks;
const int pxIdxY = blockIdx.y / numOutputBlocks;
const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
const int outputIdx = (blockIdx.y % numOutputBlocks) * B_Y + threadIdx.y;
// const int filterIdx = outputIdx * stride;
const int pxIdx = pxIdxY * imgSize + pxIdxX;
const int imgIdx = blockImgIdx + threadIdx.x;
if (outputIdx < numOutputs) {
imgs += (pxIdx) * numImages + imgIdx;
target += (outputIdx * imgPixels + pxIdx) * numImages + imgIdx;
float prod[imgsPerThread];
#pragma unroll
for (int i = 0; i < imgsPerThread; i++) {
if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
prod[i] = agg.getBaseValue();
}
}
const int myStartF = startF + outputIdx * stride;
const int loopStartF = max(0, myStartF);
const int loopEndF = min(numFilters, myStartF + poolSize);
for (int f = loopStartF; f < loopEndF; ++f) {
#pragma unroll
for (int i = 0; i < imgsPerThread; i++) {
if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
prod[i] = agg(prod[i], imgs[f * imgPixels * numImages + i * B_X]);
}
}
}
#pragma unroll
for (int i = 0; i < imgsPerThread; i++) {
if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
target[i * B_X] = agg.output(prod[i], poolSize);
}
}
}
}
/*
* imgs: (numFilters, imgPixels, numImages)
* target: (numOutputs, imgPixels, numImages)
*/
template<class Pooler>
void convPoolCrossMap(NVMatrix& images, NVMatrix& target, const int startF, const int poolSize,
const int numOutputs, const int stride, const int imgSize, Pooler pooler) {
int numImages = images.getNumCols();
int imgPixels = imgSize * imgSize;
int numFilters = images.getNumRows() / imgPixels;
assert(images.getNumRows() == numFilters * imgPixels);
assert(!images.isTrans());
assert(!target.isTrans());
assert(images.isContiguous());
// assert(numFilters % 4 == 0);
// assert(numImages % 128 == 0);
assert(stride <= poolSize);
assert(startF <= 0);
assert(startF + (numOutputs-1) * stride + poolSize >= numFilters); // All filters must be covered
cudaStream_t stream = NVMatrix::getDefaultStream();
target.resize(imgPixels*numOutputs, numImages);
int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
dim3 threads(32, 4);
dim3 blocks(imgSize * DIVUP(numImages, threads.x * imgsPerThread), imgSize * DIVUP(numOutputs, threads.y));
bool checkCaseBounds = numImages % (threads.x*imgsPerThread) != 0;
if (!checkCaseBounds) {
if (imgsPerThread == 4) {
cudaFuncSetCacheConfig(kPoolCrossMap<Pooler, 4, 32, 4, false>, cudaFuncCachePreferShared);
kPoolCrossMap<Pooler, 4, 32, 4, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, startF, poolSize, numOutputs, stride, pooler);
} else if (imgsPerThread == 2) {
cudaFuncSetCacheConfig(kPoolCrossMap<Pooler, 4, 32, 2, false>, cudaFuncCachePreferShared);
kPoolCrossMap<Pooler, 4, 32, 2, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, startF, poolSize, numOutputs, stride, pooler);
} else if (imgsPerThread == 1) {
cudaFuncSetCacheConfig(kPoolCrossMap<Pooler, 4, 32, 1, false>, cudaFuncCachePreferShared);
kPoolCrossMap<Pooler, 4, 32, 1, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, startF, poolSize, numOutputs, stride, pooler);
}
} else {
if (imgsPerThread == 1) {
cudaFuncSetCacheConfig(kPoolCrossMap<Pooler, 4, 32, 1, true>, cudaFuncCachePreferShared);
kPoolCrossMap<Pooler, 4, 32, 1, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, startF, poolSize, numOutputs, stride, pooler);
} else {
assert(false);
}
}
getLastCudaError("convPoolCrossMap: kernel execution failed");
}
/*
* Block size 16xB_X
* blockIdx.x determines 4x4 pixel.x region, image idx in batches of B_X*imgsPerThread
* blockIdx.y determines 4x4 pixel.y region, filter idx in batches of filtersPerThread
*
* So each block does a 4x4 region for some number of images/filters.
*
* threadIdx.x determines img idx
* threadIdx.y determines pixel idx
*
* imgs: (numFilters, imgPixels, numImages)
* target: (numFilters, numOutputs, numImages)
*
* B_X one of 8, 16, 32
* imgsPerThread one of 1, 2, 4, 8, 16
*
* B_XximgsPerThread MUST be divisible by 32.
* Number of filters MUST be divisible by filtersPerThread.
*
* numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
*
* Final write-out will not be fully coalesced unless B_X is 32. But there's a lot more
* reading than writing here, and the reading is all coalesced, so it should be OK.
*
* To be used when the stride is 1 and the pooling region is fairly large.
*/
template<class Agg, int B_X, int imgsPerThread, int filtersPerThread, bool checkCaseBounds>
__global__ void kLocalPool2(float* imgs, float* target, const int imgSize, const int numFilters,
const int numImages, const int subsX, const int startX,
const int outputsX, Agg agg) {
__shared__ float shImgs[filtersPerThread][B_X*imgsPerThread];
const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread);
const int numFilterBlocks = numFilters/(filtersPerThread);
const int blockOutputX = 4*(blockIdx.x / numImgBlocks);
const int blockOutputY = 4*(blockIdx.y / numFilterBlocks);
const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * filtersPerThread;
// const int blockOutputIdx = blockOutputY * outputsX + blockOutputX;
const int numOutputs = outputsX * outputsX;
const int imgPixels = imgSize * imgSize;
const int tidx = threadIdx.y * B_X + threadIdx.x;
const int loadY = tidx / 32, loadX = tidx % 32;
const int myX = threadIdx.y % 4;
const int myY = threadIdx.y / 4;
const int myOutputIdxY = blockOutputY + myY;
const int myOutputIdxX = blockOutputX + myX;
const int myOutputIdx = myOutputIdxY * outputsX + myOutputIdxX;
const int startImgPxX = startX + blockOutputX;
const int startImgPxY = startX + blockOutputY;
const int endImgPxX = startImgPxX + subsX;
const int endImgPxY = startImgPxY + subsX;
const int myStartImgPxY = startImgPxY + myY;
const int myStartImgPxX = startImgPxX + myX;
const int myEndImgPxY = endImgPxY + myY;
const int myEndImgPxX = endImgPxX + myX;
const int loopStartY = MAX(startImgPxY, 0);
const int loopStartX = MAX(startImgPxX, 0);
const int loopEndY = MIN(imgSize, endImgPxY + 3);
const int loopEndX = MIN(imgSize, endImgPxX + 3);
const int imgIdx = blockImgIdx + threadIdx.x;
imgs += (blockFilterIdx + loadY) * imgPixels * numImages + blockImgIdx + loadX;
target += (blockFilterIdx * numOutputs + myOutputIdx) * numImages + imgIdx;
float prod[filtersPerThread][imgsPerThread];
#pragma unroll
for (int f = 0; f < filtersPerThread; f++) {
#pragma unroll
for (int i = 0; i < imgsPerThread; i++) {
prod[f][i] = agg.getBaseValue();
}
}
int regionSize = 0;
for (int y = loopStartY; y < loopEndY; y++) {
const bool isInY = y >= myStartImgPxY && y < myEndImgPxY ;
for (int x = loopStartX; x < loopEndX; x++) {
// Load a pixel
const int px = y * imgSize + x;
#pragma unroll
for (int ly = 0; ly < filtersPerThread; ly += B_X/2) {
if (filtersPerThread % (B_X/2) == 0 || ly + loadY < filtersPerThread) {
#pragma unroll
for (int lx = 0; lx < B_X*imgsPerThread; lx += 32) {
if (!checkCaseBounds || lx + loadX + blockImgIdx < numImages) {
shImgs[ly + loadY][lx + loadX] = imgs[(ly * imgPixels + px) * numImages + lx];
}
}
}
}
__syncthreads();
// Is this pixel in my region?
if (isInY && x >= myStartImgPxX && x < myEndImgPxX) {
#pragma unroll
for (int i = 0; i < imgsPerThread; i++) {
if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
#pragma unroll
for (int f = 0; f < filtersPerThread; f++) {
prod[f][i] = agg(prod[f][i], shImgs[f][threadIdx.x + i * B_X]);
}
}
}
++regionSize;
}
__syncthreads();
}
}
if (myOutputIdxY < outputsX && myOutputIdxX < outputsX) {
#pragma unroll
for (int i = 0; i < imgsPerThread; i++) {
if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
#pragma unroll
for (int f = 0; f < filtersPerThread; f++) {
target[f * numOutputs * numImages + i * B_X] = agg.output(prod[f][i], regionSize);
}
}
}
}
}
/*
* imgs: (numFilters, imgPixels, numImages)
* target: (numFilters, outputs, numImages)
*/
template<class Pooler>
void convLocalPool(NVMatrix& images, NVMatrix& target, int numFilters,
int subsX, int startX, int strideX, int outputsX, Pooler pooler) {
int numImages = images.getNumCols();
int imgPixels = images.getNumRows() / numFilters;
assert(images.getNumRows() == numFilters * imgPixels);
int imgSize = int(sqrt(imgPixels));
assert(imgSize * imgSize == imgPixels);
assert(!images.isTrans());
assert(!target.isTrans());
assert(images.isContiguous());
// assert(numFilters % 4 == 0);
// assert(numImages % 128 == 0);
cudaStream_t stream = NVMatrix::getDefaultStream();
int outputs = outputsX * outputsX;
target.resize(numFilters*outputs, numImages);
if (strideX == 1 && subsX >= 6 && outputsX > 1) {
// NOTE: this part has not been optimized for Kepler
int imgsPerThread = numImages % 128 == 0 ? 8 : 4;
int filtersPerThread = numFilters % 4 == 0 ? 4 : numFilters % 3 == 0 ? 3 : numFilters % 2 == 0 ? 2 : 1;
int bx = 8;
bool checkCaseBounds = numImages % (bx*imgsPerThread) != 0;
assert((imgsPerThread * bx) % 32 == 0);
assert(numFilters % filtersPerThread == 0);
dim3 threads(bx, 16);
dim3 blocks(DIVUP(outputsX, 4) * DIVUP(numImages, bx*imgsPerThread), DIVUP(outputsX, 4) * numFilters / filtersPerThread);
// printf("threads: %dx%d, blocks: %dx%d, imgSize: %d, numFilters: %d, numImages: %d, subsX: %d, startX: %d, outputsX: %d\n",
// threads.y, threads.x, blocks.y, blocks.x, imgSize, numFilters, numImages, subsX, startX, outputsX);
if (imgsPerThread == 8) {
if (filtersPerThread == 1) {
if (checkCaseBounds) {
cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 1, true>, cudaFuncCachePreferShared);
kLocalPool2<Pooler, 8, 8, 1, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
} else {
cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 1, false>, cudaFuncCachePreferShared);
kLocalPool2<Pooler, 8, 8, 1, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
}
} else if (filtersPerThread == 2) {
if (checkCaseBounds) {
cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 2, true>, cudaFuncCachePreferShared);
kLocalPool2<Pooler, 8, 8, 2, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
} else {
cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 2, false>, cudaFuncCachePreferShared);
kLocalPool2<Pooler, 8, 8, 2, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
}
} else if (filtersPerThread == 3) {
if (checkCaseBounds) {
cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 3, true>, cudaFuncCachePreferShared);
kLocalPool2<Pooler, 8, 8, 3, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
} else {
cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 3, false>, cudaFuncCachePreferShared);
kLocalPool2<Pooler, 8, 8, 3, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
}
} else if (filtersPerThread == 4) {
if (checkCaseBounds) {
cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 4, true>, cudaFuncCachePreferShared);
kLocalPool2<Pooler, 8, 8, 4, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
} else {
cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 4, false>, cudaFuncCachePreferShared);
kLocalPool2<Pooler, 8, 8, 4, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
}
}
} else if (imgsPerThread == 4) {
if (filtersPerThread == 1) {
if (checkCaseBounds) {
cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 1, true>, cudaFuncCachePreferShared);
kLocalPool2<Pooler, 8, 4, 1, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
} else {
cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 1, false>, cudaFuncCachePreferShared);
kLocalPool2<Pooler, 8, 4, 1, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
}
} else if (filtersPerThread == 2) {
if (checkCaseBounds) {
cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 2, true>, cudaFuncCachePreferShared);
kLocalPool2<Pooler, 8, 4, 2, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
} else {
cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 2, false>, cudaFuncCachePreferShared);
kLocalPool2<Pooler, 8, 4, 2, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
}
} else if (filtersPerThread == 3) {
if (checkCaseBounds) {
cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 3, true>, cudaFuncCachePreferShared);
kLocalPool2<Pooler, 8, 4, 3, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
} else {
cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 3, false>, cudaFuncCachePreferShared);
kLocalPool2<Pooler, 8, 4, 3, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
}
} else if (filtersPerThread == 4) {
if (checkCaseBounds) {
cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 4, true>, cudaFuncCachePreferShared);
kLocalPool2<Pooler, 8, 4, 4, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
} else {
cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 4, false>, cudaFuncCachePreferShared);
kLocalPool2<Pooler, 8, 4, 4, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
}
}
}
} else {
int filtersPerThread = numFilters % 16 == 0 ? 4 : 1;
int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
bool checkCaseBounds = numImages % (32*imgsPerThread) != 0;
dim3 threads(32, 4);
dim3 blocks(DIVUP(numImages,32*imgsPerThread) * outputsX, DIVUP(numFilters, 4 * filtersPerThread) * outputsX);
if (imgsPerThread == 4) {
if (filtersPerThread == 1) {
if (checkCaseBounds) {
cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 4, 1, true>, cudaFuncCachePreferL1);
kLocalPool<Pooler, 4, 32, 4, 1, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
} else {
cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 4, 1, false>, cudaFuncCachePreferL1);
kLocalPool<Pooler, 4, 32, 4, 1, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
}
} else {
if (checkCaseBounds) {
cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 4, 4, true>, cudaFuncCachePreferL1);
kLocalPool<Pooler, 4, 32, 4, 4, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
} else {
cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 4, 4, false>, cudaFuncCachePreferL1);
kLocalPool<Pooler, 4, 32, 4, 4, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
}
}
} else if (imgsPerThread == 2) {
if (filtersPerThread == 1) {
if (checkCaseBounds) {
cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 2, 1, true>, cudaFuncCachePreferL1);
kLocalPool<Pooler, 4, 32, 2, 1, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
} else {
cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 2, 1, false>, cudaFuncCachePreferL1);
kLocalPool<Pooler, 4, 32, 2, 1, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
}
} else {
if (checkCaseBounds) {
cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 2, 4, true>, cudaFuncCachePreferL1);
kLocalPool<Pooler, 4, 32, 2, 4, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
} else {
cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 2, 4, false>, cudaFuncCachePreferL1);
kLocalPool<Pooler, 4, 32, 2, 4, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
}
}
} else {
if (filtersPerThread == 1) {
if (checkCaseBounds) {
cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 1, 1, true>, cudaFuncCachePreferL1);
kLocalPool<Pooler, 4, 32, 1, 1, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
} else {
cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 1, 1, false>, cudaFuncCachePreferL1);
kLocalPool<Pooler, 4, 32, 1, 1, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
}
} else {
if (checkCaseBounds) {
cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 1, 4, true>, cudaFuncCachePreferL1);
kLocalPool<Pooler, 4, 32, 1, 4, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
} else {
cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 1, 4, false>, cudaFuncCachePreferL1);
kLocalPool<Pooler, 4, 32, 1, 4, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
}
}
}
}
getLastCudaError("convLocalPool: kernel execution failed");
}
#endif /* CONV_UTIL_CUH */
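
Most of the bulk above is compile-time dispatch: imgsPerThread, filtersPerThread, and checkCaseBounds must be template parameters, so the host code enumerates every combination it is willing to instantiate. The launch geometry itself reduces to a few integer rules; a Python sketch of the generic (strided) kLocalPool branch:

def divup(a, b):
    # Same rounding-up division as the DIVUP macro used by the kernels.
    return (a + b - 1) // b

def local_pool_launch(num_images, num_filters, outputs_x):
    imgs_per_thread = 4 if num_images % 128 == 0 else 2 if num_images % 64 == 0 else 1
    filters_per_thread = 4 if num_filters % 16 == 0 else 1
    threads = (32, 4)  # (B_X, B_Y)
    blocks = (divup(num_images, 32 * imgs_per_thread) * outputs_x,
              divup(num_filters, 4 * filters_per_thread) * outputs_x)
    check_case_bounds = num_images % (32 * imgs_per_thread) != 0
    return threads, blocks, check_case_bounds

print(local_pool_launch(128, 64, 27))  # ((32, 4), (27, 108), False)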

View File

@ -0,0 +1,197 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef COMMON_CUH
#define COMMON_CUH
#include <helper_cuda.h> // helper functions CUDA error checking and initialization
#include "../../nvmatrix/include/nvmatrix.cuh"
#include "conv_util.cuh"
#include "caffe2/core/context_gpu.h"
enum FILTER_OUTPUT_ORDER { MODULE_FILTER_IMAGE, FILTER_MODULE_IMAGE };
void convFilterActs(
caffe2::CUDAContext* context,
caffe2::TensorCUDA* images,
caffe2::TensorCUDA* filters,
caffe2::TensorCUDA* targets,
int imgSizeY,
int numModulesY,
int numModulesX,
int paddingStart,
int moduleStride,
int numImgColors,
int numGroups);
void convFilterActs(
caffe2::CUDAContext* context,
caffe2::TensorCUDA* images,
caffe2::TensorCUDA* filters,
caffe2::TensorCUDA* targets,
int imgSizeY,
int numModulesY,
int numModulesX,
int paddingStart,
int moduleStride,
int numImgColors,
int numGroups,
float scaleTargets,
float scaleOutput);
void localFilterActs(
caffe2::CUDAContext* context,
caffe2::TensorCUDA* images,
caffe2::TensorCUDA* filters,
caffe2::TensorCUDA* targets,
int imgSizeY,
int numModulesY,
int numModulesX,
int paddingStart,
int moduleStride,
int numImgColors,
int numGroups);
void localFilterActs(
caffe2::CUDAContext* context,
caffe2::TensorCUDA* images,
caffe2::TensorCUDA* filters,
caffe2::TensorCUDA* targets,
int imgSizeY,
int numModulesY,
int numModulesX,
int paddingStart,
int moduleStride,
int numImgColors,
int numGroups,
float scaleTargets,
float scaleOutput);
void convImgActs(
caffe2::CUDAContext* context,
caffe2::TensorCUDA* hidActs,
caffe2::TensorCUDA* filters,
caffe2::TensorCUDA* targets,
int imgSizeY,
int imgSizeX,
int numModulesY,
int paddingStart,
int moduleStride,
int numImgColors,
int numGroups);
void convImgActs(
caffe2::CUDAContext* context,
caffe2::TensorCUDA* hidActs,
caffe2::TensorCUDA* filters,
caffe2::TensorCUDA* targets,
int imgSizeY,
int imgSizeX,
int numModulesY,
int paddingStart,
int moduleStride,
int numImgColors,
int numGroups,
float scaleTargets,
float scaleOutput);
void localImgActs(
caffe2::CUDAContext* context,
caffe2::TensorCUDA* hidActs,
caffe2::TensorCUDA* filters,
caffe2::TensorCUDA* targets,
int imgSizeY,
int imgSizeX,
int numModulesY,
int paddingStart,
int moduleStride,
int numImgColors,
int numGroups);
void localImgActs(
caffe2::CUDAContext* context,
caffe2::TensorCUDA* hidActs,
caffe2::TensorCUDA* filters,
caffe2::TensorCUDA* targets,
int imgSizeY,
int imgSizeX,
int numModulesY,
int paddingStart,
int moduleStride,
int numImgColors,
int numGroups,
float scaleTargets,
float scaleOutput);
void convWeightActs(
caffe2::CUDAContext* context,
caffe2::TensorCUDA* images,
caffe2::TensorCUDA* hidActs,
caffe2::TensorCUDA* targets,
int imgSizeY,
int numModulesY,
int numModulesX,
int filterSize,
int paddingStart,
int moduleStride,
int numImgColors,
int numGroups,
int sumWidth);
void convWeightActs(
caffe2::CUDAContext* context,
caffe2::TensorCUDA* images,
caffe2::TensorCUDA* hidActs,
caffe2::TensorCUDA* targets,
int imgSizeY,
int numModulesY,
int numModulesX,
int filterSize,
int paddingStart,
int moduleStride,
int numImgColors,
int numGroups,
int sumWidth,
float scaleTargets,
float scaleOutput);
void localWeightActs(
caffe2::CUDAContext* context,
caffe2::TensorCUDA* images,
caffe2::TensorCUDA* hidActs,
caffe2::TensorCUDA* targets,
int imgSizeY,
int numModulesY,
int numModulesX,
int filterSize,
int paddingStart,
int moduleStride,
int numImgColors,
int numGroups);
void localWeightActs(
caffe2::CUDAContext* context,
caffe2::TensorCUDA* images,
caffe2::TensorCUDA* hidActs,
caffe2::TensorCUDA* targets,
int imgSizeY,
int numModulesY,
int numModulesX,
int filterSize,
int paddingStart,
int moduleStride,
int numImgColors,
int numGroups,
float scaleTargets,
float scaleOutput);
#endif /* COMMON_CUH */

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,112 @@
################################################################################
#
# Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
#
# NOTICE TO USER:
#
# This source code is subject to NVIDIA ownership rights under U.S. and
# international Copyright laws.
#
# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
# OR PERFORMANCE OF THIS SOURCE CODE.
#
# U.S. Government End Users. This source code is a "commercial item" as
# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
# "commercial computer software" and "commercial computer software
# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
# and is provided to the U.S. Government only as a commercial end item.
# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
# source code with only those rights set forth herein.
#
################################################################################
# Location of the CUDA Toolkit binaries and libraries
CUDA_INC_PATH = $(CUDA_INSTALL_PATH)/include
CUDA_BIN_PATH = $(CUDA_INSTALL_PATH)/bin
CUDA_LIB_PATH = $(CUDA_INSTALL_PATH)/lib64
# Common binaries
NVCC = $(CUDA_BIN_PATH)/nvcc
GCC = g++
AR = ar
# CUDA code generation flags
GENCODE_SM35 := -gencode arch=compute_35,code=sm_35
GENCODE_FLAGS := $(GENCODE_SM35)
LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart
CCFLAGS := -m64
NVCCFLAGS := -m64
# Debug build flags
ifeq ($(dbg),1)
CCFLAGS += -g
NVCCFLAGS += -g -G
DBG := debug
else
DBG := release
NVCCFLAGS += -O3
CCFLAGS += -O3
endif
# Add profiler output
ifeq ($(prof),1)
NVCCFLAGS += --ptxas-options=-v
endif
TARGETDIR := ./bin/$(DBG)
OBJDIR := ./obj/$(DBG)
########## USER STUFF ###########
PYTHON_VERSION=$(shell python -V 2>&1 | cut -d ' ' -f 2 | cut -d '.' -f 1,2)
MODELNAME := _ConvNet
LDFLAGS += -lpthread -ljpeg -lpython$(PYTHON_VERSION) -L../util -lutilpy -L../nvmatrix -lnvmatrix -L../cudaconv3 -lcudaconv -lcublas -Wl,-rpath=./util -Wl,-rpath=./nvmatrix -Wl,-rpath=./cudaconv3
INCLUDES := -I$(CUDA_INC_PATH) -I $(CUDA_SDK_PATH)/common/inc -I./include -I$(PYTHON_INCLUDE_PATH) -I$(NUMPY_INCLUDE_PATH)
DEFINES := -DNUMPY_INTERFACE
CUFILES := $(shell find . -name "*.cu")
CU_DEPS := $(shell find . -name "*.cuh")
CCFILES := $(shell find . -name "*.cpp")
C_DEPS := $(shell find . -name "*.h")
NVCCFLAGS += --compiler-options '-fPIC'
LDFLAGS += -shared
CCFLAGS += -fPIC
TARGET := $(TARGETDIR)/$(MODELNAME).so
################################################################################
# Set up target and object files
################################################################################
OBJS += $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(CCFILES))
OBJS += $(patsubst %.c,$(OBJDIR)/%.c.o,$(CFILES))
OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu.o,$(CUFILES))
# Target rules
all: makedirs $(TARGET)
$(OBJDIR)/%.cu.o : %.cu $(CU_DEPS)
$(NVCC) $(DEFINES) $(NVCCFLAGS) $(GENCODE_FLAGS) $(INCLUDES) -o $@ -c $<
$(OBJDIR)/%.cpp.o : %.cpp $(C_DEPS)
$(GCC) $(DEFINES) $(CCFLAGS) $(INCLUDES) -o $@ -c $<
$(TARGET): $(OBJS)
$(GCC) $(CCFLAGS) -o $@ $+ $(LDFLAGS) $(EXTRA_LDFLAGS)
ln -sf $(TARGET) .
makedirs:
mkdir -p $(TARGETDIR)
mkdir -p $(OBJDIR)/src
clean:
rm -rf ./obj

View File

@ -0,0 +1,66 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef ACTBROADCASTER_CUH_H_
#define ACTBROADCASTER_CUH_H_
#include <map>
#include "streambroadcast.cuh"
#include "copypipeline.cuh"
class BroadcastMessage {
public:
enum MESSAGE_TYPE {
BROADCAST,
EXIT
};
protected:
int _srcDevice;
std::map<int, NVMatrix*> _mats;
int _userIdx;
Queue<int>* _finishQueue;
MESSAGE_TYPE _type;
BroadcastMessage(MESSAGE_TYPE type);
public:
BroadcastMessage(std::map<int, NVMatrix*> mats, int srcDevice, int userIdx, Queue<int>& finishQueue);
int getSrcDevice();
std::map<int, NVMatrix*>& getMatrices();
int getUserIdx();
Queue<int>& getFinishQueue();
MESSAGE_TYPE getMessageType();
};
class ExitBroadcastMessage : public BroadcastMessage {
public:
ExitBroadcastMessage();
};
class ActBroadcaster : public Thread {
protected:
std::map<int,IBroadcastNetwork*> _broadcasters; // src device --> broadcaster
Queue<BroadcastMessage*> _messageQueue;
int _numUsers;
public:
ActBroadcaster(int numUsers, intv& cpus);
~ActBroadcaster();
Queue<BroadcastMessage*>& getMessageQueue();
virtual void* run();
void stop();
};
#endif /* ACTBROADCASTER_CUH_H_ */

View File

@ -0,0 +1,180 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef CONVNET3
#define CONVNET3
#include <vector>
#include <string>
#include <set>
#include <map>
#include <helper_cuda.h>
#include <time.h>
#include "../../util/include/queue.h"
#include "../../util/include/thread.h"
#include <math.h>
#include "../../util/include/sync.h"
#include "messages.cuh"
#include "streambroadcast.cuh"
#include "layer.cuh"
#include "data.cuh"
#include "worker.cuh"
#include "weights.cuh"
#include "pipedispenser.cuh"
#include "timer.cuh"
class Worker;
class WorkResult;
class Layer;
class DataLayer;
class CostLayer;
class ConvNetThread;
class StreamBroadcast;
class Weights;
// name -> device id -> layer*
typedef std::map<std::string,std::map<int, Layer*> > NameReplicaLayerMap;
typedef std::map<std::string, Layer*> NameLayerMap;
// name -> ReplicaMap
//typedef std::map<int,NameLayerMap> ReplicaNameLayerMap;
typedef std::vector<ConvNetThread*> ConvNetThreadV;
typedef std::vector<DataLayer*> DataLayerVector;
//typedef std::map<int,ConvNetThreadV> ReplicaThreadsMap;
class ConvNet : public Thread {
private:
void checkGradient_copyWeightsToGPU(Matrix& weightsCPU, Weights& weights);
protected:
NameReplicaLayerMap _layerMap;
DataLayerVector _dataLayers;
// Vector of convnet threads (one thread == one GPU)
ConvNetThreadV _convNetThreads;
DataProvider* _dp;
CPUData* _data, *_bufferData;
int _bufferMinibatchIdx, _bufferPassIdx;
ThreadSynchronizer* _sync;
intv _deviceIDs;
Queue<Worker*> _workerQueue;
Queue<WorkResult*> _resultQueue;
Queue<Message*> _msgQueue;
int _numFwdTerminal;
std::map<int, int> _numBwdTerminal; // pass idx -> #terminal
int _totalPassesDone;
int _numReplicasMin, _numReplicasMax;
// For gradient checking
int _numFailures;
int _numTests;
// Training progress (between 0 and 1).
// Used to determine learning rate based on ParameterSchedule.
double _trainingProgress;
double _baseErr;
bool _conserveMem;
PipeDispenser *_dataCopyPD;
void waitForTerminals(int numMsgs, MESSAGES msg);
void sendMessage(MESSAGES msg, bool sync);
void sendMessage(Message* msg, bool sync);
void findBwdTerminal(Layer& l, std::set<Layer*>& visited, int& terminal, int passIdx);
void connectReplicas();
void initDataLayers(PyObjectV* layerList);
void initGPUThreads(PyObjectV* layerList);
void connectChildren(PyObject* layerParams);
void* run();
void setData(CPUData& data, int passIdx);
void setDataFromBuffer();
void setBuffer(CPUData* bufferData, int bufferMinibatchIdx, int bufferPassIdx);
public:
ConvNet(PyObject* layerParams, intv& deviceIDs,
int minibatchSize, bool conserveMem);
~ConvNet();
void stop();
Queue<Message*>& getMessageQueue();
Queue<Worker*>& getWorkerQueue();
Queue<WorkResult*>& getResultQueue();
DataProvider& getDataProvider();
Layer& getLayer(std::string& name, int replicaID);
void copyToCPU();
void copyToGPU();
void updateWeights(int passIdx);
void reset(int passIdx);
void reset();
void bprop(int passIdx, PASS_TYPE passType);
void fprop(int miniIdx, int passIdx, PASS_TYPE passType);
void fprop(CPUData& data, int passIdx, PASS_TYPE passType);
void setTrainingProgress(double progress);
double getTrainingProgress() const;
bool checkGradient(const std::string& name, float eps, Weights& weights);
void checkGradients();
Cost& getCost();
Cost& getCost(Cost& cost);
CPUData& getData(); // Returns last minibatch fpropped
double getCostValue();
intv& getDeviceIDs();
ThreadSynchronizer& getSync();
void syncWithChildren();
int getMinibatchSize();
bool isConserveMemory();
int getNumReplicasMax();
int getNumReplicasMin();
int getNumPasses();
int getTotalPassesDone();
PipeDispenser& getDataCopyPD();
};
class ConvNetThread : public Thread {
protected:
NameLayerMap _nameLayerMap;
std::vector<CostLayer*> _costs;
ConvNet* _convNet;
int _deviceID;
Queue<Message*> _msgQueue;
Timer _timer;
// StreamBroadcast* _weightSynchronizer;
void initCuda();
virtual void initLayer(PyObject* paramsDict, int replicaID);
void* run();
public:
ConvNetThread(PyObjectV* layerList, int deviceID, int deviceIdx, ConvNet* convNet);
~ConvNetThread();
NameLayerMap& getLayerMap();
int getDeviceID();
ConvNet& getConvNet();
Queue<Message*>& getMessageQueue();
std::vector<CostLayer*>& getCostLayers();
// StreamBroadcast& getWeightSynchronizer();
Cost& getCost();
Layer& getLayer(std::string& name);
void startTimer();
double stopTimer();
};
#endif /* CONVNET3 */

View File

@ -0,0 +1,218 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef COPYPIPELINE_CUH_
#define COPYPIPELINE_CUH_
#include <set>
#include "../../util/include/thread.h"
#include "../../util/include/queue.h"
#include <helper_cuda.h>
#include "../../nvmatrix/include/nvmatrix.cuh"
#include "util.cuh"
#define COPY_MIN_CHUNK_SIZE (1<<18) // 256k
#define COPY_MAX_CHUNKS 16
#define COPY_MIN_CHUNKS 2
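// Note (illustrative, not in the original source): these constants presumably
// bound how a matrix is split for the pipelined copy -- between COPY_MIN_CHUNKS
// and COPY_MAX_CHUNKS pieces of at least COPY_MIN_CHUNK_SIZE
// (1<<18 = 262144, i.e. 256K) each -- so transfers along the chain can overlap.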
class CopyPeer;
class CopySource;
class ICopySegment;
class IBroadcastNetwork;
class CopyMessage {
protected:
std::map<int,NVMatrix*>* _mats;
float _scaleSource, _scaleTargets;
public:
enum COPY_MESSAGE_TYPE {
COPY_CHUNK,
COPY_START,
EXIT
};
CopyMessage(COPY_MESSAGE_TYPE msgType, float scaleSource, float scaleTargets, std::map<int, NVMatrix*>& mats)
: _mats(&mats), _scaleSource(scaleSource), _scaleTargets(scaleTargets), _msgType(msgType) {
}
CopyMessage(COPY_MESSAGE_TYPE msgType)
: _mats(NULL), _scaleSource(0), _scaleTargets(0), _msgType(msgType) {
}
inline COPY_MESSAGE_TYPE getType() const {
return _msgType;
}
inline NVMatrix& getMatrix(int deviceID) const {
return *_mats->at(deviceID);
}
inline std::map<int,NVMatrix*>& getMatrices() const {
return *_mats;
}
inline float getScaleSource() const {
return _scaleSource;
}
inline float getScaleTargets() const {
return _scaleTargets;
}
protected:
COPY_MESSAGE_TYPE _msgType;
};
class CopyChunkMessage : public CopyMessage {
protected:
int _chunkIdx;
int _chunkSize;
int _numChunks;
public:
CopyChunkMessage(int chunkIdx, int chunkSize, int numChunks, float scaleSource, float scaleTargets, std::map<int, NVMatrix*>& mats)
: CopyMessage(COPY_CHUNK, scaleSource, scaleTargets, mats), _chunkIdx(chunkIdx), _chunkSize(chunkSize), _numChunks(numChunks) {
}
inline int getChunkIdx() const {
return _chunkIdx;
}
inline int getChunkSize() const {
return _chunkSize;
}
inline int getNumChunks() const {
return _numChunks;
}
};
class CopyStartMessage : public CopyMessage {
public:
CopyStartMessage(float scaleSource, float scaleTargets, std::map<int,NVMatrix*>& mats) : CopyMessage(COPY_START, scaleSource, scaleTargets, mats) {
}
};
class ICopySegment : public Thread {
protected:
int _deviceID, _execDeviceID;
cudaStream_t _stream;
ICopySegment* _prev;
std::vector<CopyPeer*> _next;
Queue<CopyMessage*> _queue;
Queue<int>* _finishQueue;
HostNVMatrix _hmat;
IBroadcastNetwork* _parent;
NVMatrix& getChunk(NVMatrix& mat, int chunkSize, int chunkIdx);
void* run();
virtual bool processMessage(CopyMessage& msg) = 0;
public:
ICopySegment(IBroadcastNetwork& parent, int deviceID, Queue<int>* finishQueue);
virtual ~ICopySegment();
inline NVMatrix& getMatrix(CopyMessage& msg);
Queue<CopyMessage*>& getQueue();
inline int getDeviceID();
void addPrev(ICopySegment& c);
void addNext(CopyPeer& c);
bool isTerminal() const;
virtual bool isSource() const = 0;
};
class CopySource : public ICopySegment {
protected:
bool processMessage(CopyMessage& msg);
public:
CopySource(IBroadcastNetwork& parent, int deviceID);
inline bool isSource() const;
};
class CopyPeer : public ICopySegment {
protected:
bool processMessage(CopyMessage& msg);
public:
CopyPeer(IBroadcastNetwork& parent, int deviceID, Queue<int>* finishQueue);
inline bool isSource() const;
};
class IBroadcastNetwork {
protected:
Queue<int> _finishQueue;
CopySource* _src;
std::vector<CopyPeer*> _peers;
int _srcDeviceID, _numTerminal;
bool _constructed;
std::set<int> _devices;
std::pair<std::vector<int>,std::vector<int> > makeGPULists();
void makePeers(std::pair<std::vector<int>,std::vector<int> >& gpus);
virtual void makeConnections() = 0;
virtual void _broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets);
IBroadcastNetwork(std::set<int>& devices, int srcDeviceID, int numTerminal);
public:
virtual IBroadcastNetwork& construct();
virtual ~IBroadcastNetwork();
virtual void broadcast(std::map<int, NVMatrix*>& mats);
int getSourceDeviceID() const;
static IBroadcastNetwork& make(std::set<int> devices, int srcDeviceID);
};
class ISafeBroadcastNetwork : public IBroadcastNetwork {
protected:
ISafeBroadcastNetwork(std::set<int>& devices, int srcDeviceID, int numTerminal);
public:
virtual void broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets);
virtual ISafeBroadcastNetwork& construct();
static ISafeBroadcastNetwork& make(std::set<int> devices, int srcDeviceID);
};
class NullBroadcaster : public ISafeBroadcastNetwork {
protected:
NullBroadcaster(std::set<int>& devices, int srcDeviceID);
void makeConnections();
public:
NullBroadcaster& construct();
void broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets);
void broadcast(std::map<int, NVMatrix*>& mats);
friend class IBroadcastNetwork;
friend class ISafeBroadcastNetwork;
};
/*
* This one goes to host and then to targets.
*/
class NaiveBroadcaster : public ISafeBroadcastNetwork {
protected:
NaiveBroadcaster(std::set<int>& devices, int srcDeviceID);
void makeConnections();
friend class IBroadcastNetwork;
friend class ISafeBroadcastNetwork;
};
class EightGPUBroadcaster1 : public IBroadcastNetwork {
protected:
EightGPUBroadcaster1(std::set<int>& devices, int srcDeviceID);
void makeConnections();
friend class IBroadcastNetwork;
};
class TwoPeeringGPUsBroadcaster : public ISafeBroadcastNetwork {
protected:
int _tgtDeviceID;
cudaStream_t _tgtStream;
void makeConnections();
void resetDeviceID(int d);
void _broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets);
public:
TwoPeeringGPUsBroadcaster(std::set<int>& devices, int srcDeviceID);
~TwoPeeringGPUsBroadcaster();
ISafeBroadcastNetwork& construct();
friend class IBroadcastNetwork;
friend class ISafeBroadcastNetwork;
};
#endif /* COPYPIPELINE_CUH_ */

View File

@ -0,0 +1,56 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef COST_CUH
#define COST_CUH
#include <vector>
#include <map>
#include <helper_cuda.h>
#include "layer.cuh"
#include "util.cuh"
class CostLayer;
/*
* Wrapper for dictionary mapping cost name to vector of returned values.
*/
class Cost {
protected:
std::map<std::string,int> _numCases;
CostMap _costMap;
CostCoeffMap _costCoeffMap;
std::map<std::string,int>& getNumCasesMap();
public:
Cost();
Cost(std::vector<CostLayer*>& costs);
doublev& operator [](const std::string s);
CostMap& getCostMap();
CostCoeffMap& getCostCoeffMap();
int getNumCases();
/*
* Returns sum of first values returned by all the CostLayers, weighted by the cost coefficients.
*/
double getValue();
Cost& operator += (Cost& er);
virtual ~Cost();
void print();
};
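// Illustrative sketch (not in the original source): given the contract
// documented on getValue() above, and assuming CostMap maps cost name ->
// doublev* and CostCoeffMap maps cost name -> double, the reduction would
// look roughly like:
//
//   double value = 0;
//   for (CostMap::iterator it = costMap.begin(); it != costMap.end(); ++it) {
//       // first value returned by each cost, weighted by its coefficient
//       value += coeffMap[it->first] * it->second->at(0);
//   }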
#endif /* COST_CUH */

View File

@ -0,0 +1,101 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATA_CUH
#define DATA_CUH
#include <vector>
#include <algorithm>
#include "util.cuh"
class CPUData {
protected:
MatrixV* _data;
void assertDimensions() {
assert(_data->size() > 0);
for (int i = 1; i < _data->size(); i++) {
assert(_data->at(i-1)->getNumCols() == _data->at(i)->getNumCols());
if (_data->at(i-1)->isTrans() != _data->at(i)->isTrans() && _data->at(i)->getNumElements() < 2) {
_data->at(i)->setTrans(_data->at(i-1)->isTrans());
}
assert(_data->at(i-1)->isTrans() == _data->at(i)->isTrans());
}
assert(_data->at(0)->getNumCols() > 0);
}
public:
typedef typename MatrixV::iterator T_iter;
// Cases in columns, but array may be transposed
// (so in memory they can really be in rows -- in which case the array is transposed
// during the copy to GPU).
CPUData(PyObject* pyData) {
_data = getMatrixV(pyData);
assertDimensions();
}
CPUData(MatrixV* data) : _data(data) {
assertDimensions();
}
~CPUData() {
for (T_iter it = _data->begin(); it != _data->end(); ++it) {
delete *it;
}
delete _data;
}
Matrix& operator [](int idx) const {
return *_data->at(idx);
}
int getSize() const {
return _data->size();
}
MatrixV& getData() const {
return *_data;
}
Matrix& getData(int i) const {
return *_data->at(i);
}
bool isTrans() const {
return _data->at(0)->isTrans();
}
int getNumCases() const {
return _data->at(0)->getNumCols();
}
};
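// Worked example of the layout comment above (illustrative, not in the
// original source): a minibatch of 128 RGB 32x32 images is logically a
// (32*32*3) x 128 = 3072 x 128 matrix with one case per column. If the
// Python side stored it row-major as 128 x 3072, isTrans() reports true
// and the actual transposition happens during the copy to the GPU.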
class DataProvider {
protected:
CPUData* _hData;
NVMatrixV _data;
int _minibatchSize;
public:
DataProvider(int minibatchSize);
void setData(CPUData&);
void clearData();
CPUData& getMinibatch(int idx);
CPUData& getDataSlice(int startCase, int endCase);
int getNumMinibatches();
int getMinibatchSize();
int getNumCases();
};
#endif /* DATA_CUH */

View File

@ -0,0 +1,88 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef GRADREDUCER_CUH_
#define GRADREDUCER_CUH_
#include <set>
#include <algorithm>
#include "streambroadcast.cuh"
#include "reducepipeline.cuh"
#include "layer.cuh"
#include "util.cuh"
class StreamBroadcast;
class Layer;
#define ACT_GRAD_REDUCER_EXIT (1 << 16)
//class ReduceMessage {
// ReduceMessage();
// ReduceMessage(bool exit);
//};
class IActGradReducer : public Thread {
protected:
Layer* _parent;
Queue<int> _finishQueue;
int _numExpectedMsgsTotal;
std::map<int,int> _numExpectedMsgs; // map from device id -> num expected msgs
void* run();
virtual bool reduce() = 0;
virtual void reset() = 0;
public:
IActGradReducer(Layer& parent, std::map<int, int> numExpectedMsgs);
virtual ~IActGradReducer();
int waitForFinish();
virtual void enqueueReduction(int deviceID) = 0;
virtual void stop() = 0;
static IActGradReducer& makeGradReducer(Layer& parent, std::map<int, int> numExpectedMsgs);
};
class SequentialActGradReducer : public IActGradReducer {
protected:
std::map<int,int> _numReceivedMsgs; // map from device id -> num received msgs
std::map<int,Queue<int>* > _messageQueues;
intv _deviceIDs;
StreamBroadcast* _broadcaster;
bool reduce();
void reset();
public:
SequentialActGradReducer(Layer& parent, std::map<int, int> numExpectedMsgs);
~SequentialActGradReducer();
void enqueueReduction(int deviceID);
void stop();
};
class ParallelActGradReducer : public IActGradReducer {
protected:
IEightGPUReducer* _reducer;
int _numReceivedMsgs;
float _scaleTarget;
Queue<int> _messageQueue;
bool reduce();
void reset();
public:
ParallelActGradReducer(Layer& parent, std::map<int, int> numExpectedMsgs);
void enqueueReduction(int deviceID);
void stop();
};
#endif /* GRADREDUCER_CUH_ */

View File

@ -0,0 +1,61 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef JPEG_MAIN_H
#define JPEG_MAIN_H
#include <cstdio>
#include <cstdlib>
#include <Python.h>
#include <vector>
#include <string>
#include <iostream>
#include <jpeglib.h>
//#include <arrayobject.h>
#include "../../util/include/thread.h"
#include "../../util/include/matrix.h"
#ifndef DIVUP
#define DIVUP(x, y) (((x) + (y) - 1) / (y))
#endif
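// e.g. DIVUP(1000, 128) == 8: ceiling division, so the final partial chunk
// of work still gets a thread (or block) assigned to it.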
#define NUM_JPEG_DECODER_THREADS 4
class DecoderThread : public Thread {
protected:
PyObject* _pyList;
Matrix* _target;
int64 _start_img, _end_img;
int64 _img_size, _inner_size, _inner_pixels;
bool _test, _multiview;
unsigned char* _decodeTarget;
int64 _decodeTargetSize;
unsigned int _rseed;
void* run();
void decodeJpeg(int idx, int& width, int& height);
double randUniform();
double randUniform(double min, double max);
void crop(int64 i, int64 width, int64 height, bool flip);
virtual void crop(int64 i, int64 src_width, int64 src_height, bool flip, int64 crop_start_x, int64 crop_start_y);
public:
DecoderThread(PyObject* pyList, Matrix& target, int start_img, int end_img, int img_size, int inner_size, bool test, bool multiview);
virtual ~DecoderThread();
};
#endif // JPEG_MAIN_H

View File

@ -0,0 +1,812 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LAYER_CUH
#define LAYER_CUH
#include <algorithm>
#include <string>
#include <vector>
#include <map>
#include <assert.h>
#include <helper_timer.h>
#include "../../nvmatrix/include/nvmatrix.cuh"
//#include "experimental/akrizhevsky/g3/mactruck-gpu-tests/gpu_util.cuh"
#include "weights.cuh"
#include "convnet.cuh"
#include "cost.cuh"
#include "neuron.cuh"
#include "data.cuh"
#include "layer_kernels.cuh"
#include "streambroadcast.cuh"
#include "actbroadcaster.cuh"
#include "gradreducer.cuh"
#include "util.cuh"
#include "timer.cuh"
#include "memorysource.cuh"
class Cost;
class ConvNet;
class ConvNetThread;
class CostLayer;
class DataLayer;
class Layer;
class ActBroadcaster;
class BroadcastMessage;
class IActGradReducer;
class Weights;
class WeightList;
typedef std::vector<Layer*> LayerV;
class BinomialCrossEntOperator {
protected:
float _posWeight;
public:
BinomialCrossEntOperator(float posWeight) : _posWeight(posWeight) {
}
__device__ inline float operator()(const float t, const float y) const {
return _posWeight * t * safelog(y) + (1.0f - t) * safelog(1.0f - y);
}
};
class CrossEntOperator {
protected:
float _posWeight;
public:
CrossEntOperator(float posWeight) : _posWeight(posWeight) {
}
__device__ inline float operator()(const float t, const float y) const {
return _posWeight * t * safelog(y);
}
};
/*
* Abstract layer.
*/
class Layer {
protected:
ConvNetThread* _convNetThread;
// This is a vector[#layers_next]
std::vector<Layer*> _next;
// This is a vector[#replicas_prev][#layers_prev]
std::map<int, std::vector<Layer*> > _prev;
int _rcvdFInputMsgs;
std::map<int, int> _numComputedActsGrads;
int _rcvdBInputMsgs;
int _numOutputs;
std::map<int, NVMatrix*> _inputs; // input idx -> matrix
std::map<int, MemoryView*> _memSrcActs; // device id -> memory source
std::map<int, MemoryView*> _memSrcActsGrad; // device id -> memory source
bool _gradConsumer, _foundGradConsumers, _trans;
std::map<int,bool> _bwdTerminal; // One bool per pass
int _numGradProducersNext;
int _actsTarget, _actsGradTarget;
std::string _name, _type;
intv _nextDeviceIDs, _prevDeviceIDs;
HostNVMatrix _hostMemFwd;
// New replica-related stuff:
std::map<int,Layer*> _replicas; // NOTE: a layer is its own sibling, too
// Previous layers sorted by device ID, in reverse order in which they are
// processed by the sequential grad reducer. Map from replica -> device id -> layers.
std::map<int,std::map<int,std::set<Layer*> > > _prevByDevice;
std::map<std::string, int> _inputIndices;
int _replicaID;
int _numReplicas;
int _numReplicasPrev, _numReplicasNext;
Queue<int> _broadcastFinishQueue;
Queue<int> _reductionFinishQueue;
ActBroadcaster* _actBroadcaster;
IActGradReducer* _gradReducer;
Timer _timer;
bool _initialized;
virtual void fpropNext(PASS_TYPE passType, int passIdx);
virtual void truncBwdActs();
virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) = 0;
virtual void bpropCommon(NVMatrix& v, int replicaIdx, PASS_TYPE passType) {
// Do nothing by default
}
virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
assert(!isGradProducer()); // Doing nothing here is only valid for layers that produce no gradient
}
virtual void fpropCommon(PASS_TYPE passType) {
}
void bpropActsCall(NVMatrix& v, PASS_TYPE passType, int replicaIdx, int inputIdx);
ActBroadcaster& getActBroadcaster();
IActGradReducer& getGradReducer();
int getInputIdx(std::string& parentName);
void setInputIdx(std::string& parentName, int idx);
public:
static bool _saveActsGrad, _saveActs;
Layer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans);
virtual ~Layer();
virtual bool fprop(PASS_TYPE passType, int passIdx);
void fprop(NVMatrix& v, int inpIdx, PASS_TYPE passType, int passIdx);
virtual void fprop(std::map<int,NVMatrix*>& v, PASS_TYPE passType, int passIdx);
virtual void bprop(PASS_TYPE passType, int passIdx);
virtual void bprop(NVMatrix& v, PASS_TYPE passType, int passIdx);
virtual void reset();
virtual void resetPassIdx();
int getNumCases(NVMatrix& v);
int& getNumComputedActsGrads(int deviceID);
int incRcvdBInputMsgs();
bool isGradConsumer();
bool hasGradProducerNext(std::string& layerName);
// Does this layer produce a gradient for any layer?
virtual bool isGradProducer();
// Does this layer produce a gradient for layer of given name?
virtual bool isGradProducer(std::string& layerName);
std::string& getName();
std::string& getType();
virtual void addNext(Layer& l);
virtual void addPrev(Layer& l, int replicaIdx);
virtual void addReplica(Layer& l);
std::map<int,std::vector<Layer*> >& getPrev();
std::vector<Layer*>& getNext();
virtual NVMatrix& getActs();
virtual NVMatrix& getActs(int deviceID);
virtual NVMatrix& getActs(int deviceID, int numCases);
virtual NVMatrix& getActsGrad();
virtual NVMatrix& getActsGrad(int deviceID);
virtual std::map<int,NVMatrix*> getAllActs();
virtual std::map<int, NVMatrix*> getAllActsGrads();
virtual bool postInit();
int getDeviceID();
ConvNetThread& getConvNetThread();
cudaStream_t getStream();
void syncStream();
void setBwdTerminal(int passIdx);
// Do nothing if this layer has no weights
virtual bool updateWeights() {
return false;
}
virtual bool constrainWeights() {
return false;
}
virtual void checkGradient() {
}
virtual void copyToCPU() {
}
virtual void copyToGPU() {
}
intv& getNextDeviceIDs() {
return _nextDeviceIDs;
}
int getReplicaID();
int getNumReplicas();
int getNumSiblingReplicas();
int getNumReplicasPrev();
int getNumReplicasNext();
int getNumOutputs();
void setMemorySourceActs(int deviceID, MemoryView& mem);
void setMemorySourceActsGrad(int deviceID, MemoryView& mem);
MemoryView& getMemorySourceActs(int deviceID);
MemoryView& getMemorySourceActsGrad(int deviceID);
int getFwdActiveInputReplicaIdx(int passIdx);
int getBwdActiveInputReplicaIdx(int passIdx);
int getFwdActiveReplicaIdx(int passIdx);
int getNumLayersPrev();
virtual int getNumInputReplicas();
int getNumExpectedBwdMsgs();
int getNumExpectedFwdMsgs();
int getReplicaIdx();
int getActivePassPeriod();
int getNumGradProducersNext();
virtual ConvNet& getConvNet();
};
class TwoDLayerInterface {
protected:
int _channels, _imgSize, _imgPixels;
public:
TwoDLayerInterface(PyObject* paramsDict);
};
class NeuronLayer : public Layer {
protected:
Neuron* _neuron;
std::string _neuronType;
virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
virtual bool bpropSpecial(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
class CrossEntLogisticGradientOperator {
private:
float _coeff, _posWeight;
public:
CrossEntLogisticGradientOperator(float coeff, float posWeight) : _coeff(coeff), _posWeight(posWeight) {
}
__device__ inline float operator()(const float y, const float t) const {
return _coeff * (_posWeight * t * (1.0f - y) + (t - 1.0f) * y);
}
};
NeuronLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
~NeuronLayer();
std::string& getNeuronType();
};
class WeightLayer : public Layer {
protected:
WeightList* _weights;
Weights *_biases;
NVMatrix _norm2;
float _wStep, _bStep;
int _weightUpdatePassPeriod;
void fpropCommon(PASS_TYPE passType);
void bpropCommon(NVMatrix& v, int replicaIdx, PASS_TYPE passType);
virtual void bpropBiases(NVMatrix& v, PASS_TYPE passType) = 0;
virtual void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType) = 0;
virtual void _constrainWeights();
virtual float getGradScale(int inpIdx, PASS_TYPE passType);
virtual float getIncScale(int inpIdx, PASS_TYPE passType);
virtual float getBGradScale(PASS_TYPE passType);
virtual float getBIncScale();
virtual NVMatrix& getGradTarget(int inpIdx);
NVMatrix& getWeightMatrix(PASS_TYPE passType, int inpIdx);
NVMatrix& getBiasMatrix(PASS_TYPE passType);
public:
WeightLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans, bool useGrad);
virtual ~WeightLayer();
virtual bool updateWeights();
virtual bool constrainWeights();
virtual void copyToCPU();
virtual void copyToGPU();
virtual void checkGradient();
Weights& getWeights(int idx);
void addReplica(Layer& l);
virtual bool postInit();
};
class FCLayer : public WeightLayer {
protected:
virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
virtual void bpropBiases(NVMatrix& v, PASS_TYPE passType);
virtual void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType);
virtual void _constrainWeights();
public:
FCLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad);
FCLayer();
};
class SplitFCLayer : public FCLayer {
protected:
int _numParts;
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
// void bpropBiases(NVMatrix& v, PASS_TYPE passType);
void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType);
void splitWeights();
public:
SplitFCLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad);
};
class SoftmaxLayer : public Layer {
protected:
bool _doUpperGrad;
NVMatrix _max, _sum;
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
SoftmaxLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
void setDoUpperGrad(bool b);
};
class ConcatenationLayer : public Layer {
protected:
intv* _copyOffsets;
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
ConcatenationLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
virtual ~ConcatenationLayer();
};
class PassThroughLayer : public Layer {
protected:
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
PassThroughLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
virtual bool postInit();
};
class EltwiseSumLayer : public Layer {
protected:
floatv* _coeffs;
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
EltwiseSumLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
~EltwiseSumLayer();
};
class EltwiseMaxLayer : public Layer {
protected:
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
EltwiseMaxLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
class SumLayer : public Layer {
protected:
int _stride;
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
SumLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
class DataCopyMessage {
public:
enum MESSAGE_TYPE {
COPY,
EXIT
};
protected:
CPUData* _cpuData;
int _passIdx;
bool _other;
DataCopyMessage::MESSAGE_TYPE _type;
DataCopyMessage(DataCopyMessage::MESSAGE_TYPE type) : _cpuData(NULL), _passIdx(0), _other(false), _type(type) {
}
public:
DataCopyMessage(CPUData& cpuData, bool other, int passIdx) : _cpuData(&cpuData), _passIdx(passIdx), _other(other), _type(DataCopyMessage::COPY) {
}
CPUData& getData() const {
return *_cpuData;
}
int getPassIdx() const {
return _passIdx;
}
bool isOther() const {
return _other;
}
DataCopyMessage::MESSAGE_TYPE getType() {
return _type;
}
};
class DataCopyExitMessage : public DataCopyMessage {
public:
DataCopyExitMessage() : DataCopyMessage(DataCopyMessage::EXIT) {
}
};
class DataCopyThread;
class DataLayer : public Layer {
protected:
bool _useBuffer;
int _dataIdx;
ConvNet* _convNet;
// std::map<int, NVMatrix*> _outputs2; // Buffer for copying data during computation
std::map<int, MemoryView*> _memSrcActs2; // Buffer for copying data during computation
std::map<int, cudaStream_t> _copyStreams;
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
Queue<int> _copyFinishQueue;
DataCopyThread* _copier;
bool _outstandingCopyRequest;
int _start, _end;
public:
void fprop(PASS_TYPE passType, int passIdx, bool fromBuffer);
DataLayer(ConvNet* convNet, PyObject* paramsDict, int replicaID);
~DataLayer();
NVMatrix& getActs(int deviceID);
// NVMatrix& getActs(int deviceID, bool other);
NVMatrix& getActs(int deviceID, bool other, int numCases);
bool isGradProducer();
void toggleBuffer(int passIdx);
void copyData(CPUData& data, bool other, int passIdx);
bool postInit();
ConvNet& getConvNet();
int getNumInputReplicas();
cudaStream_t getCopyStream(int deviceID);
Queue<int>& getCopyFinishQueue() {
return _copyFinishQueue;
}
void waitForCopyFinish();
int getDataIdx() const {
return _dataIdx;
}
int getStart() const {
return _start;
}
int getEnd() const {
return _end;
}
};
class DataCopyThread : public Thread {
protected:
DataLayer* _parent;
Queue<DataCopyMessage*> _queue;
HostNVMatrix _hostMemFwd;
Timer _requestTimer;
int _sleepUsec;
virtual void* run();
public:
DataCopyThread(DataLayer& parent, intv& cpus);
Queue<DataCopyMessage*>& getQueue();
void stop();
};
class LocalLayer : public WeightLayer {
protected:
intv* _padding, *_stride, *_filterSize, *_channels, *_imgSize, *_groups;
intv* _imgPixels, *_filterPixels, *_filterChannels;
int _modulesX, _modules, _numFilters;
public:
LocalLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad);
virtual ~LocalLayer();
};
class ConvLayer : public LocalLayer {
protected:
int _sumWidth;
bool _sharedBiases;
floatv* _weightContrastNormMin, *_weightContrastNormMax;
NVMatrix _weightGradTmp;
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
void bpropBiases(NVMatrix& v, PASS_TYPE passType);
void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType);
void truncBwdActs();
void _constrainWeights();
public:
ConvLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
virtual ~ConvLayer();
};
class LocalUnsharedLayer : public LocalLayer {
protected:
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
void bpropBiases(NVMatrix& v, PASS_TYPE passType);
void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType);
void _constrainWeights();
public:
LocalUnsharedLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
class PoolLayer : public Layer, public TwoDLayerInterface {
protected:
int _sizeX, _start, _stride, _outputsX;
std::string _pool;
public:
PoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans);
static PoolLayer& make(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
class AvgPoolLayer : public PoolLayer {
protected:
bool _sum;
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
AvgPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
class MaxPoolLayer : public PoolLayer {
protected:
bool _abs;
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
MaxPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool abs);
};
class CrossMapPoolLayer : public Layer, public TwoDLayerInterface {
protected:
int _size, _start, _stride, _outputs;
std::string _pool;
public:
CrossMapPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans);
static CrossMapPoolLayer& make(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
class CrossMapMaxPoolLayer : public CrossMapPoolLayer {
protected:
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
CrossMapMaxPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
class RandomScaleLayer : public Layer, public TwoDLayerInterface {
protected:
int _tgtSize, _minScaledSize;
float _maxScale; // should be >= 1
NVMatrix _rescaledActs;
std::vector<double> _scaleProbs;
public:
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
RandomScaleLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
class CropLayer : public Layer, public TwoDLayerInterface {
protected:
int _tgtSize, _startX, _startY;
public:
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
CropLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
class NailbedLayer : public Layer, public TwoDLayerInterface {
protected:
int _start, _stride, _outputsX;
public:
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
NailbedLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
class GaussianBlurLayer : public Layer, public TwoDLayerInterface {
protected:
Matrix* _hFilter;
NVMatrix _filter;
NVMatrix _actGradsTmp;
public:
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
void copyToGPU();
GaussianBlurLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
~GaussianBlurLayer();
};
class HorizontalReflectionLayer : public Layer, public TwoDLayerInterface {
protected:
public:
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
HorizontalReflectionLayer(ConvNetThread* convNet, PyObject* paramsDict, int replicaID);
};
class ResizeLayer : public Layer, public TwoDLayerInterface {
protected:
float _scale;
int _tgtSize;
public:
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
ResizeLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
class DropoutLayer : public Layer {
protected:
bool _enable;
float _keep;
NVMatrix _keepMask;
public:
virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
void truncBwdActs();
DropoutLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
class DropoutSmallerThanOperator {
private:
float _keep, _scale;
public:
DropoutSmallerThanOperator(float keep) : _keep(keep), _scale(1.0f/keep) {
}
__device__ inline float operator()(const float x) const {
return (x < _keep) * _scale;
}
};
};
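// Note on DropoutSmallerThanOperator above (illustrative, not in the original
// source): with x presumably drawn uniformly from [0, 1), a unit survives with
// probability _keep and survivors are scaled by 1/_keep, so the expected
// activation is unchanged: E[(x < keep) * (1/keep)] = keep * (1/keep) = 1.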
class Dropout2Layer : public DropoutLayer {
protected:
public:
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
Dropout2Layer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
class RGBToYUVLayer : public Layer {
public:
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
RGBToYUVLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
class RGBToLABLayer : public Layer {
protected:
bool _center;
public:
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
RGBToLABLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
class ResponseNormLayer : public Layer, public TwoDLayerInterface {
protected:
int _size;
float _scale, _pow;
float _minDiv;
NVMatrix _denoms;
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
void truncBwdActs();
public:
ResponseNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
class CrossMapResponseNormLayer : public ResponseNormLayer {
protected:
bool _blocked;
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
CrossMapResponseNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
class ContrastNormLayer : public ResponseNormLayer {
protected:
NVMatrix _meanDiffs;
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
void truncBwdActs();
public:
ContrastNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
class CostLayer : public Layer {
protected:
float _coeff;
doublev _costv;
NVMatrix _tmpbuf; // For error accumulation
int _numCases; // number of cases that the values in _costv were computed on
bool _aggregated;
void fpropCommon(PASS_TYPE passType);
public:
CostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans);
void bprop(NVMatrix& v, PASS_TYPE passType, int passIdx);
bool fprop(PASS_TYPE passType, int passIdx);
int getNumCases();
virtual doublev& getCost();
float getCoeff();
bool isGradProducer();
void setSendTerminalMessages(bool send);
void resetPassIdx();
static CostLayer& make(ConvNetThread* convNetThread, PyObject* paramsDict, std::string& type, int replicaID);
};
/*
* Input 0: labels
* Input 1: softmax outputs
*/
class CrossEntCostLayer : public CostLayer {
protected:
NVMatrix _trueLabelLogProbs, _correctProbs;
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
CrossEntCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
/*
* Input 0: labels
* Input 1: softmax outputs
*/
class LogregCostLayer : public CostLayer {
protected:
NVMatrix _trueLabelLogProbs, _correctProbs, _topkProbs;
std::map<int,NVMatrix*> _probsAccum; // input replica idx -> nvmatrix
NVMatrix _maxProbs;
std::map<int,int> _numAccumed; // input replica idx -> int
int _topk;
bool _doCompute;
virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
LogregCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
NVMatrix& getProbsAccum(int replicaIdx);
};
/*
* Input 0: labels
* Input 1: logistic outputs
*/
class BinomialCrossEntropyCostLayer : public CostLayer {
protected:
bool _computeSoftmaxErrorRate;
NVMatrix _tmpProbs, _tmpVec, _correctProbs;
float _posWeight;
virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
BinomialCrossEntropyCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
float getPosWeight();
// Only for use with non-logistic units
class BinomialCrossEntGradientOperator {
private:
float _coeff, _posWeight;
public:
BinomialCrossEntGradientOperator(float coeff, float posWeight) : _coeff(coeff), _posWeight(posWeight) {
}
__device__ inline float operator()(const float t, const float y) const {
return _coeff * (_posWeight * __fdividef(t, y) + __fdividef(t - 1.0f, 1.0f - y));
}
};
};
/*
* Input 0: labels
* Input 1: logistic outputs
*/
class DetectionCrossEntropyCostLayer : public BinomialCrossEntropyCostLayer {
protected:
Matrix _hNumPositive, _hNumTruePositive, _hNumDeclaredPositive;
NVMatrix _numPositive, _numTrueNegative, _numTruePositive, _numDeclaredPositive;
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
public:
DetectionCrossEntropyCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
class SumOfSquaresCostLayer : public CostLayer {
protected:
NVMatrix _tmp;
void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
SumOfSquaresCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
#endif /* LAYER_CUH */

View File

@ -0,0 +1,88 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LAYER_KERNELS_CUH
#define LAYER_KERNELS_CUH
#include <vector>
#include <helper_cuda.h>
#include "../../nvmatrix/include/nvmatrix.cuh"
#define LOGREG_GRAD_THREADS_X 32
#define LOGREG_GRAD_THREADS_Y 4
#define LOGREG_ERR_THREADS_X 128
#define LOGREG_ERR_THREADS_Y 1
__device__ inline float safelog(const float x) {
return x > 0.0f ? __logf(x) : -50.0f;
}
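// Clamping at -50 (log x = -50 corresponds to x ~= 2e-22) keeps a fully
// saturated probability from yielding -inf and poisoning downstream sums.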
// The input matrix here is the squared column norm.
// The operator returns the scale factor to apply to that column:
// 1 if the squared norm is below the threshold given by norm2,
// norm/sqrt(a) otherwise -- i.e. the factor that restores the desired norm (not squared).
class MaxWeightConstraintOperator {
private:
float _norm, _norm2;
public:
MaxWeightConstraintOperator(float norm) : _norm(norm), _norm2(norm*norm) {
}
__device__ inline float operator()(const float a) const {
return a > _norm2 ? __fdividef(_norm, sqrtf(a)) : 1.0f;
}
};
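// Illustrative host-side check of the mapping above (not in the original
// source). With norm = 2 (so _norm2 = 4), a column with squared norm 3 is
// left alone (factor 1), while one with squared norm 9 is scaled back to
// norm 2 (factor 2/3):
//
//   inline float maxConstraintFactor(float sqNorm, float norm) {
//       return sqNorm > norm * norm ? norm / sqrtf(sqNorm) : 1.0f;
//   }
//   // maxConstraintFactor(3.0f, 2.0f) == 1.0f
//   // maxConstraintFactor(9.0f, 2.0f) == 2.0f / 3.0f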
class HardWeightConstraintOperator {
private:
float _norm, _norm2;
public:
HardWeightConstraintOperator(float norm) : _norm(norm), _norm2(norm*norm) {
}
__device__ inline float operator()(const float a) const {
return __fdividef(_norm, sqrtf(a));
}
};
class WeightContrastNormOperator {
private:
float _min, _max, _scale;
public:
WeightContrastNormOperator(float min, float max, float scale) : _min(min), _max(max), _scale(scale) {
}
__device__ inline float operator()(float a) const {
a = sqrtf(a) * _scale;
return a < _min ? __fdividef(_min, a) : a > _max ? __fdividef(_max, a) : 1.0f;
}
};
void computeCrossEntCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out);
void computeCrossEntGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
void computeSoftmaxGrad(NVMatrix& acts, NVMatrix& actsGrad, NVMatrix& target, float scaleTarget, float scaleGrad);
void computeLogregCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out);
void computeLogregGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
// Numerical stability optimization: this routine combines computeLogregGrad with computeSoftmaxGrad
// to avoid dividing and then multiplying by quantities that may be near zero.
void computeCrossEntSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
void computeLogregSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
void computeEltwiseMaxGrad(NVMatrix& actGrad, NVMatrix& input, NVMatrix& output, NVMatrix& target, bool add);
void computeMultiSoftmaxCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs, NVMatrix& labelLogProbs_out,
NVMatrix& correctProbs_out, NVMatrix& top5Probs_out, int setSize);
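// Worked identity behind the fused routines above (illustrative note, not in
// the original source): for softmax outputs y = softmax(z) and one-hot labels
// t, the chain rule gives
//
//   d/dz_i [ -sum_k t_k * log(y_k) ] = y_i - t_i
//
// so the fused gradient comes directly from y and t, whereas the two-step
// path first forms t_k / y_k -- a quotient that blows up when y_k is near
// zero -- and only then multiplies the softmax Jacobian back in.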
#endif /* LAYER_KERNELS_CUH */

View File

@ -0,0 +1,74 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LR_CUH
#define LR_CUH
#include <string>
#include <vector>
#include <iostream>
#include <helper_cuda.h>
#include <assert.h>
#include <Python.h>
#include "util.cuh"
#include "../../nvmatrix/include/nvmatrix.cuh"
#include "../../util/include/matrix.h"
/*
* The maximum learning rate is _baseRate.
* The minimum learning rate is _baseRate / _tgtFactor.
*
* These classes define annealing schedules that interpolate between these
* two extrema.
*/
class ParameterSchedule {
protected:
double _baseRate;
public:
ParameterSchedule(double base);
virtual double getValue(double progress);
double getBaseValue() const;
virtual ~ParameterSchedule();
static ParameterSchedule& make(PyObject* schedDict);
};
class LinearParameterSchedule : public ParameterSchedule {
protected:
double _finalRate;
public:
LinearParameterSchedule(double base, double tgtFactor);
virtual double getValue(double progress);
};
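/*
 * The implementations live in lr.cu (not shown here). As a rough sketch of
 * what LinearParameterSchedule::getValue presumably computes, given the
 * extrema described above and progress in [0, 1]:
 *
 *   getValue(progress) = _baseRate + progress * (_finalRate - _baseRate),
 *   with _finalRate = _baseRate / tgtFactor
 *
 * e.g. base = 0.1, tgtFactor = 10: the rate anneals linearly from 0.1 to 0.01.
 */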
class ExpParameterSchedule : public ParameterSchedule {
protected:
double _powBase;
public:
ExpParameterSchedule(double baseRate, double tgtFactor);
virtual double getValue(double progress);
};
class DiscreteExpParameterSchedule : public ParameterSchedule {
protected:
std::vector<double> _rates;
public:
DiscreteExpParameterSchedule(double baseRate, double tgtFactor, int numSteps);
virtual double getValue(double progress);
};
#endif /* LR_CUH */

View File

@ -0,0 +1,61 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <map>
#include <set>
#include "../../nvmatrix/include/nvmatrix.cuh"
class MemorySource;
class MemoryView {
protected:
MemorySource* _src;
std::string _name;
public:
MemoryView(MemorySource& src, std::string& name);
~MemoryView();
NVMatrix& getMemory(int numCases);
NVMatrix& getMemory();
MemorySource& getMemorySource();
bool isParent();
std::string& getName();
MemoryView& clone(std::string& name);
};
// Remember: PassThroughLayer, and therefore MemorySource, exists on a particular GPU.
class MemorySource {
protected:
// int _inputIdx;
NVMatrix _memory;
int _deviceID;
int _size;
std::map<std::string, std::pair<int,int> > _viewRanges;
std::map<std::string, NVMatrix*> _memoryViews; // input idx --> slice of _memory
std::set<std::string> _truncateRequests;
Lock _lock;
public:
MemorySource(int size, int deviceID);
~MemorySource();
NVMatrix& getMemory(std::string& name, int numCases);
NVMatrix& getMemory(std::string& name);
MemoryView& addUser(std::string& name, std::pair<int,int> range);
MemoryView& addUser(std::string& name);
std::pair<int,int> getRange(std::string& name);
int getSize();
bool truncate(std::string& name);
static MemoryView& make(int size, int deviceID, std::string& parentUser);
};

View File

@ -0,0 +1,128 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MESSAGES_CUH_
#define MESSAGES_CUH_
#include <string>
#include "layer.cuh"
class Layer;
enum MESSAGES { FPROP_TERMINAL,
BPROP_TERMINAL,
BPROP_READY,
FPROP_READY,
SYNC,
COPY_TO_CPU,
COPY_TO_GPU,
UPDATE_WEIGHTS,
CONSTRAIN_WEIGHTS,
RESET,
RESET_PASS_IDX,
COST_COMPUTED,
BPROP_START,
EXIT_CONVNET};
class Message {
protected:
MESSAGES _messageType;
public:
MESSAGES getType() {
return _messageType;
}
virtual Message* clone() {
return new Message(_messageType);
}
Message(MESSAGES messageType) : _messageType(messageType) {
}
virtual ~Message() {
}
};
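/*
 * Consumption sketch (hypothetical, not part of this header): a GPU worker
 * thread owning a Queue<Message*> would dispatch on the type tag, e.g.
 *
 *   Message* m = queue.dequeue();
 *   if (m->getType() == FPROP_READY) {
 *       FpropMessage& fm = *static_cast<FpropMessage*>(m);
 *       // ... fprop fm.getToLayer() for fm.getPassType()/fm.getPassIdx() ...
 *   }
 *   delete m;
 */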
class PropMessage : public Message {
protected:
Layer *_toLayer;
PASS_TYPE _passType;
int _passIdx;
public:
Layer& getToLayer() {
return *_toLayer;
}
PASS_TYPE getPassType() {
return _passType;
}
int getPassIdx() {
return _passIdx;
}
virtual PropMessage* clone() {
return new PropMessage(*_toLayer, _passType, _passIdx, _messageType);
}
PropMessage(Layer& toLayer, PASS_TYPE passType, int passIdx, MESSAGES msgType)
: Message(msgType), _toLayer(&toLayer), _passType(passType), _passIdx(passIdx) {
}
};
class FpropMessage : public PropMessage {
public:
FpropMessage(Layer& toLayer, PASS_TYPE passType, int passIdx)
: PropMessage(toLayer, passType, passIdx, FPROP_READY) {
}
virtual FpropMessage* clone() {
return new FpropMessage(*_toLayer, _passType, _passIdx);
}
};
class BpropMessage : public PropMessage {
public:
BpropMessage(Layer& toLayer, PASS_TYPE passType, int passIdx)
: PropMessage(toLayer, passType, passIdx, BPROP_READY) {
}
virtual BpropMessage* clone() {
return new BpropMessage(*_toLayer, _passType, _passIdx);
}
};
class BpropStartMessage : public Message {
protected:
PASS_TYPE _passType;
int _passIdx;
public:
PASS_TYPE getPassType() {
return _passType;
}
int getPassIdx() {
return _passIdx;
}
virtual BpropStartMessage* clone() {
return new BpropStartMessage(_passType, _passIdx);
}
BpropStartMessage(PASS_TYPE passType, int passIdx)
: Message(BPROP_START), _passType(passType), _passIdx(passIdx) {
}
};
#endif /* MESSAGES_CUH_ */

View File

@ -0,0 +1,541 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef NEURONS_CUH
#define NEURONS_CUH
#include <Python.h>
#include <assert.h>
#include <string>
#include "../../nvmatrix/include/nvmatrix.cuh"
#include <helper_cuda.h>
template <class GradientOp>
class AddGradientBinaryOperator {
GradientOp _op;
public:
AddGradientBinaryOperator(GradientOp op) : _op(op) {
}
__device__ inline float operator()(const float unitActGrad, const float unitAct, const float target) const {
return _op(unitActGrad, unitAct) + target;
}
};
template <class GradientOp>
class AddGradientOperator {
GradientOp _op;
public:
AddGradientOperator(GradientOp op) : _op(op) {
}
__device__ inline float operator()(const float unitActGrad, const float target) const {
return target + _op(unitActGrad);
}
};
/* =======================
* Neuron
* -----------------------
*
* f(x) = x
* =======================
*/
class Neuron {
protected:
bool _activated;
// Inputs and outputs potentially point to the same matrix, depending on the neuron
NVMatrix* _inputs, *_outputs;
virtual void _activate() {
if (_inputs != _outputs) {
_inputs->copy(*_outputs);
}
}
virtual void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
if (&target != &actsGrad) {
actsGrad.copy(target);
}
}
virtual void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
if (&target != &actsGrad) {
target.add(actsGrad);
}
}
public:
Neuron() : _activated(false), _inputs(NULL), _outputs(NULL) {
}
virtual void activate(NVMatrix& inputs, NVMatrix& outputs) {
_activated = true;
_inputs = &inputs;
_outputs = &outputs;
_activate();
}
virtual void computeInputGrad(NVMatrix& actsGrad, NVMatrix& target, bool add) {
assert(_activated);
if (!add) {
target.resize(actsGrad);
_computeInputGrad(actsGrad, target);
} else {
_addInputGrad(actsGrad, target);
}
}
static Neuron& makeNeuron(PyObject* neuronDict);
};
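// Contract implied by the code above: activate() records the input/output
// matrices and applies f elementwise; computeInputGrad() with add == false
// resizes target and overwrites it with the input gradient, while add == true
// accumulates into target (presumably for inputs whose gradient has several
// contributors).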
/* =======================
* LogisticNeuron
* -----------------------
*
* f(x) = 1 / (1 + e^-x)
* =======================
*/
class LogisticNeuron : public Neuron {
protected:
void _activate() {
_inputs->apply(NVMatrixOps::Logistic(), *_outputs);
}
void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
actsGrad.applyBinary(LogisticGradientOperator(), *_outputs, target);
}
void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
actsGrad.applyTernary(AddGradientBinaryOperator<LogisticGradientOperator>(LogisticGradientOperator()), *_outputs, target, target);
}
public:
class LogisticGradientOperator {
public:
__device__ inline float operator()(float unitActGrad, float unitAct) const {
return unitActGrad * unitAct * (1.0f - unitAct);
}
};
LogisticNeuron() : Neuron() {
}
};
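// Why the gradient reads the *output*: for y = 1/(1 + e^-x), dy/dx = y*(1 - y),
// so backprop never needs the pre-activation x -- exactly what
// LogisticGradientOperator computes from unitAct.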
/* =======================
* LogNeuron
* -----------------------
*
* f(x) = log(eps + x)
* =======================
*/
class LogNeuron : public Neuron {
protected:
float _eps;
void _activate() {
_inputs->apply(LogOperator(_eps), *_outputs);
}
void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
actsGrad.applyBinary(LogGradientOperator(_eps), *_inputs, target);
}
void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
actsGrad.applyTernary(AddGradientBinaryOperator<LogGradientOperator>(LogGradientOperator(_eps)), *_inputs, target, target);
}
public:
class LogGradientOperator {
protected:
float _eps;
public:
__device__ inline float operator()(float unitActGrad, float unitInput) const {
return __fdividef(unitActGrad, _eps + unitInput);
}
LogGradientOperator(float eps) : _eps(eps) {
}
};
class LogOperator {
protected:
float _eps;
public:
__device__ inline float operator()(float x) const {
return __logf(_eps + x);
}
LogOperator(float eps) : _eps(eps) {
}
};
LogNeuron(float eps) : Neuron(), _eps(eps) {
}
};
/* =======================
* ReluNeuron
* -----------------------
*
* f(x) = max(0, x)
* =======================
*/
class ReluNeuron : public Neuron {
protected:
virtual void _activate() {
_inputs->apply(ReluOperator(), *_outputs);
}
void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
actsGrad.applyBinary(ReluGradientOperator(), *_outputs, target);
}
void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
actsGrad.applyTernary(AddGradientBinaryOperator<ReluGradientOperator>(ReluGradientOperator()), *_outputs, target, target);
}
public:
class ReluOperator {
public:
__device__ inline float operator()(float x) const {
return x < 0.0f ? 0.0f : x;
}
};
class ReluGradientOperator {
public:
__device__ inline float operator()(float unitActGrad, float unitAct) const {
return unitActGrad * (unitAct > 0.0f);
}
};
ReluNeuron() : Neuron() {
}
};
/* =======================
* BoundedReluNeuron
* -----------------------
*
* f(x) = min(a, max(0, x))
* =======================
*/
class BoundedReluNeuron : public Neuron {
protected:
float _a;
void _activate() {
_inputs->apply(BoundedReluOperator(_a), *_outputs);
}
void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
actsGrad.applyBinary(BoundedReluGradientOperator(_a), *_outputs, target);
}
void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
actsGrad.applyTernary(AddGradientBinaryOperator<BoundedReluGradientOperator>(BoundedReluGradientOperator(_a)), *_outputs, target, target);
}
public:
class BoundedReluOperator {
private:
float _a;
public:
BoundedReluOperator(float a) : _a(a) {
}
__device__ inline float operator()(float x) const {
return x < 0.0f ? 0.0f : x > _a ? _a : x;
}
};
class BoundedReluGradientOperator {
private:
float _a;
public:
BoundedReluGradientOperator(float a) : _a(a) {
}
__device__ inline float operator()(float unitActGrad, float unitAct) const {
return unitActGrad * (unitAct > 0.0f) * (unitAct < _a);
}
};
BoundedReluNeuron(float a) : Neuron(), _a(a) {
}
};
/* =======================
* AbsNeuron
* -----------------------
*
* f(x) = abs(x)
* =======================
*/
class AbsNeuron : public Neuron {
protected:
void _activate() {
assert(_inputs != _outputs);
_inputs->apply(NVMatrixOps::Abs(), *_outputs);
}
void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
actsGrad.applyBinary(AbsGradientOperator(), *_inputs, target);
}
void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
actsGrad.applyTernary(AddGradientBinaryOperator<AbsGradientOperator>(AbsGradientOperator()), *_inputs, target, target);
}
public:
class AbsGradientOperator {
public:
__device__ inline float operator()(float unitActGrad, float unitInput) const {
return unitActGrad * (unitInput > 0.0f ? 1.0f : -1.0f);
}
};
AbsNeuron() : Neuron() {
}
};
/* =======================
* TanhNeuron
* -----------------------
*
* f(x) = a*tanh(b*x)
* =======================
*/
class TanhNeuron : public Neuron {
protected:
float _a, _b;
void _activate() {
_inputs->apply(TanhOperator(_a, _b), *_outputs);
}
void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
actsGrad.applyBinary(TanhGradientOperator(_a, _b), *_outputs, target);
}
void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
actsGrad.applyTernary(AddGradientBinaryOperator<TanhGradientOperator>(TanhGradientOperator(_a, _b)), *_outputs, target, target);
}
public:
class TanhOperator {
private:
float _a, _n2b;
public:
TanhOperator(float a, float b) : _a(a), _n2b(-2*b) {
}
virtual __device__ inline float operator()(float x) const {
return _a * (__fdividef(2.0f, 1.0f + __expf(x * _n2b)) - 1.0f);
}
};
class TanhGradientOperator {
private:
float _b, _a;
public:
TanhGradientOperator(float a, float b) : _b(b), _a(a) {
}
__device__ inline float operator()(float unitActGrad, float unitAct) const {
// const float t = (1.0f - __fdividef(unitAct, _a)) / 2.0f;
// return unitActGrad * _n4ab * (t * (t - 1.0f));
return unitActGrad * _b * (_a - __fdividef(unitAct * unitAct, _a));
}
};
TanhNeuron(float a, float b) : Neuron(), _a(a), _b(b) {
}
};
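// Derivation notes: TanhOperator uses the identity tanh(z) = 2/(1 + e^(-2z)) - 1
// (with _n2b = -2b), so a*tanh(b*x) costs a single __expf. For the gradient,
// f(x) = a*tanh(b*x) gives f'(x) = a*b*(1 - tanh^2(b*x)) = b*(a - f(x)^2 / a),
// which is what TanhGradientOperator evaluates from the stored activation.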
/* =======================
* DoubleReluNeuron
* -----------------------
*
* f(x) = x - a*tanh(x/a)
* =======================
*/
class DoubleReluNeuron : public Neuron {
protected:
float _a;
void _activate() {
assert(_inputs != _outputs);
_inputs->apply(DoubleReluOperator(_a), *_outputs);
}
void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
actsGrad.applyBinary(DoubleReluGradientOperator(_a), *_inputs, target);
}
void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
actsGrad.applyTernary(AddGradientBinaryOperator<DoubleReluGradientOperator>(DoubleReluGradientOperator(_a)), *_inputs, target, target);
}
public:
class DoubleReluOperator {
private:
float _a, _n2a;
public:
DoubleReluOperator(float a) : _a(a), _n2a(-2.0f / a) {
}
virtual __device__ inline float operator()(float x) const {
return x - _a * (__fdividef(2.0f, 1.0f + __expf(_n2a * x)) - 1.0f);
}
};
class DoubleReluGradientOperator {
private:
float _n2a;
public:
DoubleReluGradientOperator(float a) : _n2a(-2.0f / a) {
}
__device__ inline float operator()(float unitActGrad, float unitInput) const {
const float tanh = __fdividef(2.0f, 1.0f + __expf(_n2a * unitInput)) - 1.0f;
return unitActGrad * (tanh*tanh);
}
};
DoubleReluNeuron(float a) : Neuron(), _a(a) {
}
};
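// Derivation note: with f(x) = x - a*tanh(x/a),
// f'(x) = 1 - (1 - tanh^2(x/a)) = tanh^2(x/a). The gradient therefore needs
// the raw *input* to recompute tanh(x/a), which is why _activate() asserts
// that the input and output matrices are distinct.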
/* =======================
* SoftReluNeuron
* -----------------------
*
* f(x) = log(1 + e^x)
* =======================
*/
class SoftReluNeuron : public Neuron {
protected:
void _activate() {
// assert(_inputs != _outputs);
_inputs->apply(SoftReluOperator(), *_outputs);
}
void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
actsGrad.applyBinary(SoftReluGradientOperator(), *_outputs, target);
}
void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
actsGrad.applyTernary(AddGradientBinaryOperator<SoftReluGradientOperator>(SoftReluGradientOperator()), *_outputs, target, target);
}
public:
class SoftReluOperator {
public:
__device__ inline float operator()(float x) const {
// This piece-wise implementation has better numerical stability than
// simply computing log(1 + e^x).
return x > 4.0f ? x : __logf(1.0f + __expf(x));
}
};
class SoftReluGradientOperator {
public:
__device__ inline float operator()(float unitActGrad, float unitOutput) const {
if (unitOutput > 4.0f) {
return unitActGrad;
}
const float f = __expf(-unitOutput);
return unitActGrad * (1.0f - f);
}
};
SoftReluNeuron() : Neuron() {
}
};
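// Derivation note: for y = log(1 + e^x), dy/dx = sigmoid(x); since
// e^y = 1 + e^x, sigmoid(x) = (e^y - 1)/e^y = 1 - e^(-y), so the gradient can
// be recovered from the stored output alone. The y > 4 early-out mirrors the
// cutoff in the forward pass, where sigmoid(x) is already close to 1.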
/* =======================
* SquareNeuron
* -----------------------
*
* f(x) = x^2
* =======================
*/
class SquareNeuron : public Neuron {
protected:
void _activate() {
assert(_inputs != _outputs);
_inputs->apply(NVMatrixOps::Square(), *_outputs);
}
void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
actsGrad.applyBinary(SquareGradientOperator(), *_inputs, target);
}
void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
actsGrad.applyTernary(AddGradientBinaryOperator<SquareGradientOperator>(SquareGradientOperator()), *_inputs, target, target);
}
public:
class SquareGradientOperator {
public:
__device__ inline float operator()(float unitActGrad, float unitInput) const {
return unitActGrad * 2.0f * unitInput;
}
};
SquareNeuron() : Neuron() {
}
};
/* =======================
* SqrtNeuron
* -----------------------
*
* f(x) = sqrt(x)
* =======================
*/
class SqrtNeuron : public Neuron {
protected:
void _activate() {
_inputs->apply(NVMatrixOps::Sqrt(), *_outputs);
}
void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
actsGrad.applyBinary(SqrtGradientOperator(), *_outputs, target);
}
void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
actsGrad.applyTernary(AddGradientBinaryOperator<SqrtGradientOperator>(SqrtGradientOperator()), *_outputs, target, target);
}
public:
class SqrtGradientOperator {
public:
__device__ inline float operator()(float unitActGrad, float unitAct) const {
return __fdividef(unitActGrad, 2.0f * unitAct);
}
};
SqrtNeuron() : Neuron() {
}
};
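// Derivation note: for y = sqrt(x), dy/dx = 1/(2*sqrt(x)) = 1/(2*y), so here
// too the gradient comes from the stored output alone.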
/* =======================
* LinearNeuron
* -----------------------
*
* f(x) = a*x + b
* =======================
*/
class LinearNeuron : public Neuron {
protected:
float _a, _b;
void _activate() {
_inputs->apply(NVMatrixOps::Linear(_a, _b), *_outputs);
}
void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
actsGrad.scale(_a, target);
}
void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
actsGrad.applyBinary(AddGradientOperator<NVMatrixOps::MultByScalar>(NVMatrixOps::MultByScalar(_a)), target, target);
}
public:
LinearNeuron(float a, float b) : Neuron(), _a(a), _b(b) {
}
};
#endif /* NEURONS_CUH */

View File

@ -0,0 +1,175 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef PIPEDISPENSER_CUH_
#define PIPEDISPENSER_CUH_
#include <pthread.h>
#include <set>
#include <algorithm>
#include <iterator>
#include "../../util/include/thread.h"
#include "util.cuh"
/*
* PipeDispenser interface
*/
class PipeDispenser {
protected:
int _numPipes;
seti _pipes;
pthread_mutex_t *_mutex;
void lock() {
pthread_mutex_lock(_mutex);
}
void unlock() {
pthread_mutex_unlock(_mutex);
}
virtual void init() {
_mutex = (pthread_mutex_t*)(malloc(sizeof (pthread_mutex_t)));
pthread_mutex_init(_mutex, NULL);
}
public:
PipeDispenser(const seti& pipes) {
_pipes.insert(pipes.begin(), pipes.end());
init();
}
PipeDispenser(int numPipes) {
for (int i = 0; i < numPipes; ++i) {
_pipes.insert(i);
}
init();
}
virtual ~PipeDispenser() {
pthread_mutex_destroy(_mutex);
free(_mutex);
}
virtual int getPipe(const seti& interested) = 0;
int getPipe(int interested) {
seti tmp;
tmp.insert(interested);
return getPipe(tmp);
}
virtual void freePipe(int pipe) = 0;
};
/*
* This one blocks until there is a free pipe to return.
*/
class PipeDispenserBlocking : public PipeDispenser {
protected:
pthread_cond_t *_cv;
void wait() {
pthread_cond_wait(_cv, _mutex);
}
void broadcast() {
pthread_cond_broadcast(_cv);
}
int getAvailablePipes(const seti& interested, intv& available) {
available.clear();
std::set_intersection(_pipes.begin(), _pipes.end(), interested.begin(), interested.end(), std::back_inserter(available));
return available.size();
}
virtual void init() {
// The PipeDispenser constructor has already run the base init() (virtual
// dispatch cannot reach this override during base construction), so _mutex
// exists; calling PipeDispenser::init() again here would leak it. This
// override therefore only sets up the condition variable.
_cv = (pthread_cond_t*)(malloc(sizeof (pthread_cond_t)));
pthread_cond_init(_cv, NULL);
}
public:
PipeDispenserBlocking(const seti& pipes) : PipeDispenser(pipes) {
init();
}
PipeDispenserBlocking(int numPipes) : PipeDispenser(numPipes) {
init();
}
~PipeDispenserBlocking() {
pthread_cond_destroy(_cv);
free(_cv);
}
int getPipe(const seti& interested) {
lock();
intv avail;
while (getAvailablePipes(interested, avail) == 0) {
wait();
}
int pipe = avail[0];
_pipes.erase(pipe);
unlock();
return pipe;
}
void freePipe(int pipe) {
lock();
_pipes.insert(pipe);
broadcast();
unlock();
}
};
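/*
 * Usage sketch (hypothetical): a worker restricted to pipes {0, 2} would do
 *
 *   seti interested;
 *   interested.insert(0);
 *   interested.insert(2);
 *   int p = dispenser.getPipe(interested); // blocks until 0 or 2 is free
 *   // ... use pipe p ...
 *   dispenser.freePipe(p);                 // wakes any blocked waiters
 */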
/*
* This one returns the least-occupied pipe.
*/
class PipeDispenserNonBlocking : public PipeDispenser {
protected:
std::map<int,int> _pipeUsers;
public:
PipeDispenserNonBlocking(const seti& pipes) : PipeDispenser(pipes) {
for (seti::iterator it = pipes.begin(); it != pipes.end(); ++it) {
_pipeUsers[*it] = 0;
}
}
int getPipe(const seti& interested) {
lock();
int pipe = -1, users = 1 << 30;
for (seti::iterator it = _pipes.begin(); it != _pipes.end(); ++it) {
if (interested.count(*it) > 0 && _pipeUsers[*it] < users) {
pipe = *it;
users = _pipeUsers[*it];
}
}
if (pipe >= 0) {
_pipeUsers[pipe]++;
}
unlock();
return pipe;
}
void freePipe(int pipe) {
lock();
_pipeUsers[pipe]--;
unlock();
}
};
#endif /* PIPEDISPENSER_CUH_ */

View File

@ -0,0 +1,35 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef PYCONVNET3_CUH
#define PYCONVNET3_CUH
#define _QUOTEME(x) #x
#define QUOTEME(x) _QUOTEME(x)
extern "C" void init_ConvNet();
PyObject* initModel(PyObject *self, PyObject *args);
PyObject* startBatch(PyObject *self, PyObject *args);
PyObject* finishBatch(PyObject *self, PyObject *args);
PyObject* checkGradients(PyObject *self, PyObject *args);
PyObject* syncWithHost(PyObject *self, PyObject *args);
PyObject* startMultiviewTest(PyObject *self, PyObject *args);
PyObject* startFeatureWriter(PyObject *self, PyObject *args);
PyObject* startDataGrad(PyObject *self, PyObject *args);
PyObject* decodeJpeg(PyObject *self, PyObject *args);
#endif

View File

@ -0,0 +1,185 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef REDUCEPIPELINE_CUH_H_
#define REDUCEPIPELINE_CUH_H_
#include "../../util/include/thread.h"
#include "../../util/include/queue.h"
#include <helper_cuda.h>
#include "../../nvmatrix/include/nvmatrix.cuh"
#include "util.cuh"
#define REDUCE_MIN_CHUNK_SIZE (1<<18) // 256k
#define REDUCE_MAX_CHUNKS 16
#define REDUCE_MIN_CHUNKS 2
enum REDUCE_MESSAGE_TYPE {
REDUCE_CHUNK,
REDUCE_START,
EXIT
};
class ReducePeer;
class ReducerSource;
class IReduceSegment;
class IEightGPUReducer;
class ReduceMessage {
protected:
REDUCE_MESSAGE_TYPE _msgType;
float _scaleIntermediates, _scaleTarget;
std::map<int,NVMatrix*>* _mats;
public:
ReduceMessage(REDUCE_MESSAGE_TYPE msgType, float scaleIntermediates, float scaleTarget, std::map<int,NVMatrix*>& mats)
: _msgType(msgType), _scaleIntermediates(scaleIntermediates), _scaleTarget(scaleTarget), _mats(&mats) {
}
ReduceMessage(REDUCE_MESSAGE_TYPE msgType)
: _msgType(msgType), _scaleIntermediates(0), _scaleTarget(0), _mats(NULL) {
}
inline REDUCE_MESSAGE_TYPE getType() const {
return _msgType;
}
inline float getScaleIntermediates() const {
return _scaleIntermediates;
}
inline float getScaleTarget() const {
return _scaleTarget;
}
inline NVMatrix& getMatrix(int deviceID) const {
return *_mats->at(deviceID);
}
inline std::map<int,NVMatrix*>& getMatrices() const {
return *_mats;
}
};
class ReduceChunkMessage : public ReduceMessage {
protected:
int _chunkIdx;
int _chunkSize;
int _numChunks;
IReduceSegment* _src;
public:
ReduceChunkMessage(IReduceSegment& src, int chunkIdx, int chunkSize, int numChunks, float scaleIntermediates, float scaleTarget, std::map<int,NVMatrix*>& mats)
: ReduceMessage(REDUCE_CHUNK, scaleIntermediates, scaleTarget, mats),
_chunkIdx(chunkIdx), _chunkSize(chunkSize), _numChunks(numChunks), _src(&src) {
}
inline int getChunkIdx() const {
return _chunkIdx;
}
inline int getChunkSize() const {
return _chunkSize;
}
inline int getNumChunks() const {
return _numChunks;
}
inline IReduceSegment& getSource() const {
return *_src;
}
};
class ReduceStartMessage : public ReduceMessage {
public:
ReduceStartMessage(float scaleIntermediates, float scaleTarget, std::map<int,NVMatrix*>& mats)
: ReduceMessage(REDUCE_START, scaleIntermediates, scaleTarget, mats) {
}
};
class IReduceSegment : public Thread {
protected:
int _deviceID;
std::vector<IReduceSegment*> _prev;
ReducePeer* _next;
Queue<ReduceMessage*> _queue;
Queue<int>* _finishQueue;
NVMatrix& getChunk(const NVMatrix& mat, int chunkSize, int chunkIdx);
void* run();
virtual bool processMessage(ReduceMessage& msg) = 0;
public:
IReduceSegment(IEightGPUReducer& parent, int deviceID, Queue<int>* finishQueue);
virtual ~IReduceSegment();
inline virtual NVMatrix& getMatrix(ReduceMessage& msg);
Queue<ReduceMessage*>& getQueue();
int getDeviceID() const;
void addPrev(IReduceSegment& c);
void addNext(ReducePeer& c);
bool isTerminal() const;
};
class ReducerSource : public IReduceSegment {
protected:
bool processMessage(ReduceMessage& msg);
public:
ReducerSource(IEightGPUReducer& parent, int deviceID);
};
class ReducePeer : public IReduceSegment {
protected:
std::map<int,cudaStream_t> _streams; // device id -> stream
std::map<int,int> _numInputsReceived; // chunk idx -> num inputs
int _numInputsFinished;
HostNVMatrix _mat;
bool _add;
bool processMessage(ReduceMessage& msg);
inline cudaStream_t getStream(int deviceID);
inline NVMatrix& getMatrix(ReduceMessage& msg);
void hostAdd(const float* src, float* tgt, const int n, const float scaleTgt);
public:
ReducePeer(IEightGPUReducer& parent, int deviceID, Queue<int>* finishQueue);
ReducePeer(IEightGPUReducer& parent);
~ReducePeer();
};
class IEightGPUReducer {
protected:
std::vector<ReducerSource*> _sources;
std::vector<ReducePeer*> _peers;
Queue<int> _finishQueue;
int _tgtDeviceID;
virtual void makeConnections(std::vector<int>& same, std::vector<int>& other) = 0;
public:
IEightGPUReducer(int tgtDeviceID);
virtual ~IEightGPUReducer();
IEightGPUReducer& construct();
void reduce(std::map<int, NVMatrix*>& mats, float scaleIntermediates, float scaleTarget);
void reduce(std::map<int, NVMatrix*>& mats, float scaleIntermediates);
void reduce(std::map<int, NVMatrix*>& mats);
int getTgtDeviceID() const;
};
class EightGPUReducer1 : public IEightGPUReducer {
protected:
void makeConnections(std::vector<int>& same, std::vector<int>& other);
public:
EightGPUReducer1(int tgtDeviceID);
};
class EightGPUReducer2 : public IEightGPUReducer {
protected:
void makeConnections(std::vector<int>& same, std::vector<int>& other);
public:
EightGPUReducer2(int tgtDeviceID);
};
#endif /* REDUCEPIPELINE_CUH_H_ */

View File

@ -0,0 +1,53 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef STREAMBROADCAST_CUH_
#define STREAMBROADCAST_CUH_
#include <iostream>
#include "../../util/include/queue.h"
#include "../../nvmatrix/include/nvmatrix.cuh"
#include "util.cuh"
class Layer;
//#define NUM_STREAM_COPY_PARTS 4
// This is in 4-byte words, not bytes
#define SB_MIN_CHUNK_SIZE (1<<17)
#define SB_MAX_CHUNKS 16
class StreamBroadcast {
protected:
std::map<int,cudaStream_t> _streams;
std::set<int> _ownedStreams;
HostNVMatrix _hostMem;
void toHostMem(NVMatrix& src, NVMatrix& hostmem, int srcDevice);
void toTarget(NVMatrix& hostmem, NVMatrix& tgt, int tgtDevice, float scaleTarget, float scaleOutput);
void init(std::map<int,cudaStream_t>& streams);
void init(std::map<int,NVMatrix*>& mats);
public:
StreamBroadcast(std::map<int,cudaStream_t>& streams);
StreamBroadcast();
virtual ~StreamBroadcast();
void transfer(std::map<int,NVMatrix*>& mats, HostNVMatrix& hostmem, int srcDevice, float scaleTarget, float scaleOutput);
void transfer(std::map<int,NVMatrix*>& mats, int srcDevice, float scaleTarget, float scaleOutput);
void transfer(std::map<int,NVMatrix*>& mats, int srcDevice);
void sync(int deviceID);
cudaStream_t getStream(int deviceID);
};
#endif /* STREAMBROADCAST_CUH_ */

View File

@ -0,0 +1,52 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef TIMER_CC_H_
#define TIMER_CC_H_
#include <helper_timer.h>
class Timer {
protected:
StopWatchInterface* _timer;
bool _started;
public:
Timer() : _started(false) {
sdkCreateTimer(&_timer);
}
~Timer() {
sdkDeleteTimer(&_timer);
}
inline void start () {
_started = true;
sdkResetTimer(&_timer);
sdkStartTimer(&_timer);
}
inline double stop() {
sdkStopTimer(&_timer);
_started = false;
return sdkGetTimerValue(&_timer);
}
inline bool isStarted() const {
return _started;
}
};
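// Usage sketch: stop() reports the elapsed time since start(), in milliseconds
// (the unit helper_timer's StopWatch uses).
//   Timer t;
//   t.start();
//   // ... work ...
//   double ms = t.stop();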
#endif /* TIMER_CC_H_ */

View File

@ -0,0 +1,130 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef UTIL_H
#define UTIL_H
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <vector>
#include <map>
#include <set>
#include <string>
#include <sstream>
#include <Python.h>
#include "../../nvmatrix/include/nvmatrix.cuh"
#include "../../util/include/matrix.h"
#define PASS_TYPE uint
#define PASS_TRAIN 0x1
#define PASS_TEST 0x2
#define PASS_GC 0x4
#define PASS_MULTIVIEW_TEST (PASS_TEST | 0x8)
#define PASS_MULTIVIEW_TEST_START (PASS_MULTIVIEW_TEST | 0x10)
#define PASS_MULTIVIEW_TEST_END (PASS_MULTIVIEW_TEST | 0x20)
#define PASS_FEATURE_GEN 0x40
#define HAS_FLAG(f, x) (((x) & (f)) == (f))
#define IS_MULTIVIEW_TEST(x) HAS_FLAG(PASS_MULTIVIEW_TEST, x)
#define IS_MULTIVIEW_TEST_START(x) HAS_FLAG(PASS_MULTIVIEW_TEST_START, x)
#define IS_MULTIVIEW_TEST_END(x) HAS_FLAG(PASS_MULTIVIEW_TEST_END, x)
#define IS_TEST(x) HAS_FLAG(PASS_TEST, x)
#define IS_TRAIN(x) HAS_FLAG(PASS_TRAIN, x)
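// Worked example: PASS_MULTIVIEW_TEST == (0x2 | 0x8) == 0xA and
// PASS_MULTIVIEW_TEST_START == 0x1A. HAS_FLAG requires *all* bits of f, so
// IS_TEST(PASS_MULTIVIEW_TEST_START) is true (bit 0x2 is set) while
// IS_MULTIVIEW_TEST(PASS_TEST) is false (0x2 & 0xA != 0xA).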
// For gradient checking
#define GC_SUPPRESS_PASSES false
#define GC_REL_ERR_THRESH 0.02
#ifdef DO_PRINT
#define PRINT(x, args...) printf(x, ## args);
#else
#define PRINT(x, args...) ;
#endif
/*
* Generates a random floating point number in the range 0-1.
*/
#define randf ((float)rand() / RAND_MAX)
//typedef std::vector<Matrix*> MatrixV;
//typedef std::vector<NVMatrix*> NVMatrixV;
typedef std::map<std::string,std::vector<double>*> CostMap;
typedef std::map<std::string,double> CostCoeffMap;
typedef std::vector<double> doublev;
typedef std::vector<float> floatv;
typedef std::vector<int> intv;
typedef std::vector<std::string> stringv;
typedef std::set<int> seti;
typedef std::vector<PyObject*> PyObjectV;
stringv* getStringV(PyObject* pyList);
floatv* getFloatV(PyObject* pyList);
intv* getIntV(PyObject* pyList);
MatrixV* getMatrixV(PyObject* pyList);
MatrixV* getMatrixV(PyObject* pyList, int len);
int* getIntA(PyObject* pyList);
int pyDictGetInt(PyObject* dict, const char* key);
intv* pyDictGetIntV(PyObject* dict, const char* key);
std::string pyDictGetString(PyObject* dict, const char* key);
float pyDictGetFloat(PyObject* dict, const char* key);
floatv* pyDictGetFloatV(PyObject* dict, const char* key);
Matrix* pyDictGetMatrix(PyObject* dict, const char* key);
MatrixV* pyDictGetMatrixV(PyObject* dict, const char* key);
int* pyDictGetIntA(PyObject* dict, const char* key);
stringv* pyDictGetStringV(PyObject* dict, const char* key);
bool pyDictHasKey(PyObject* dict, const char* key);
PyObjectV* pyDictGetValues(PyObject* dict);
template<typename T> std::string tostr(T n);
template<typename T> void shuffleVector(std::vector<T>& v, int start, int end);
template<class T> void deleteElements(std::vector<T*>& v);
template<class T> void deleteElements(std::vector<T*>& v, bool deleteContainer);
template<class T>
int indexOf(std::vector<T>& v, T e) {
int i = 0;
// typename vector<T>::iterator it2 = v.begin();
for (typename std::vector<T>::const_iterator it = v.begin(); it != v.end(); ++it) {
if (*it == e) {
return i;
}
++i;
}
return -1;
}
std::vector<int>& getDeviceCPUs(int deviceID);
template<typename K, typename V> std::set<K> getKeys(std::map<K,V>& m) {
std::set<K> s;
for (typename std::map<K,V>::const_iterator it = m.begin(); it != m.end(); ++it) {
s.insert(it->first);
}
return s;
}
struct LayerIDComparator {
bool operator()(PyObject* i, PyObject* j) {
return pyDictGetInt(i, "id") < pyDictGetInt(j, "id");
}
};
#endif /* UTIL_H */

View File

@ -0,0 +1,159 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef WEIGHTS_CUH
#define WEIGHTS_CUH
#include <string>
#include <vector>
#include <iostream>
#include <helper_cuda.h>
#include <assert.h>
#include "../../nvmatrix/include/nvmatrix.cuh"
#include "../../util/include/matrix.h"
#include "util.cuh"
#include "lr.cuh"
#include "layer.cuh"
#include "copypipeline.cuh"
#include "reducepipeline.cuh"
#include "streambroadcast.cuh"
class Layer;
class Weights;
class StreamBroadcast;
class IWeightReducer {
protected:
int _tgtReplicaID;
std::map<int,Weights*> _replicas;
int getDeviceID();
public:
IWeightReducer(std::map<int,Weights*>& replicas, int srcReplicaID);
virtual ~IWeightReducer();
static IWeightReducer& make(std::map<int,Weights*>& replicas, int srcReplicaID);
virtual void reduce(std::map<int, NVMatrix*> gradShards, float gradScale, bool toInc) = 0;
};
class SequentialWeightReducer : public IWeightReducer {
protected:
StreamBroadcast* _sb;
public:
SequentialWeightReducer(std::map<int,Weights*>& replicas, int srcReplicaID);
~SequentialWeightReducer();
void reduce(std::map<int, NVMatrix*> gradShards, float gradScale, bool toInc);
};
class ParallelWeightReducer : public IWeightReducer {
protected:
IEightGPUReducer* _reducer;
public:
ParallelWeightReducer(std::map<int,Weights*>& replicas, int srcReplicaID);
~ParallelWeightReducer();
void reduce(std::map<int, NVMatrix*> gradShards, float gradScale, bool toInc);
};
class Weights {
protected:
Matrix* _hWeights, *_hWeightsInc;
NVMatrix* _weights, *_weightsInc, *_weightsGrad;
ParameterSchedule* _lrs;
float _wc, _mom, _wball;
bool _onGPU, _useGrad, _cleanup;
int _numUpdates;
// Note: every layer is its own sibling too
std::map<int,Weights*> _replicas;
// Non-NULL if these weights are really shared from some other layer
Weights* _srcWeights;
Layer* _parent;
int _shardSize;
IWeightReducer* _reducer;
ISafeBroadcastNetwork* _broadcaster;
void aggregateReplicaGradients(float progress);
// TODO: assert that these return contiguous views
template<class T> T& getShard(T& mat, int replicaID);
template<class T> T& getShard(T& mat);
void init(Matrix& hWeights, Matrix& hWeightsInc, ParameterSchedule& lrs, Layer& parent, float wc, float wball, float mom, bool useGrad, bool cleanup);
public:
NVMatrix& operator*() const;
Weights(Weights& srcWeights, ParameterSchedule& lrs, Layer& parent);
Weights(Matrix& hWeights, Matrix& hWeightsInc, ParameterSchedule& lrs, Layer& parent,
float wc, float wball, float mom, bool useGrad);
virtual ~Weights();
virtual NVMatrix& getW() const;
virtual NVMatrix& getInc() const;
virtual NVMatrix& getGrad() const;
virtual Matrix& getCPUW() const;
virtual Matrix& getCPUWInc() const;
virtual ParameterSchedule& getLearningRateSchedule() const;
virtual int getNumRows() const;
virtual int getNumCols() const;
virtual void copyToCPU();
// This function is assumed to be called in the order in which the layers
// were defined
virtual void copyToGPU();
virtual void update(float progress);
virtual void addReplica(Weights& sibling);
int incNumUpdates();
// Returns the number of times a gradient has been computed for this
// weight matrix during the current pass (interval between two calls of update())
// through the net. This number will only be greater than 1 if this weight matrix
// is *shared* by multiple layers in the net.
int getNumUpdates() const;
float getEps(float progress) const;
float getMom() const;
float getWC() const;
float getWBall() const;
bool isUseGrad() const;
bool isOwner() const;
int getReplicaID();
int getDeviceID();
Layer& getParent();
std::map<int,Weights*>& getReplicas();
ISafeBroadcastNetwork& getBroadcaster();
IWeightReducer& getReducer();
};
class WeightList {
private:
std::vector<Weights*> _weightList;
public:
Weights& operator[](const int idx) const;
~WeightList();
WeightList();
Weights& at(const int i) const;
void addWeights(Weights& w);
void addReplica(WeightList& sibling);
void update(float progress);
void copyToCPU();
void copyToGPU();
int getSize() const;
};
#endif /* WEIGHTS_CUH */

View File

@ -0,0 +1,123 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef WORKER_CUH
#define WORKER_CUH
#include "convnet.cuh"
#include "cost.cuh"
#include "data.cuh"
class ConvNet;
class Cost;
class WorkResult {
public:
enum RESULTS {BATCH_DONE, SYNC_DONE};
protected:
WorkResult::RESULTS _resultType;
Cost* _results;
public:
WorkResult(WorkResult::RESULTS resultType, Cost& results);
WorkResult(WorkResult::RESULTS resultType);
virtual ~WorkResult();
Cost& getResults() const;
WorkResult::RESULTS getResultType() const;
};
class Worker {
protected:
ConvNet* _convNet;
public:
Worker(ConvNet& convNet);
virtual ~Worker();
virtual bool run() = 0;
};
class DataWorker : public Worker {
protected:
CPUData* _data;
DataProvider* _dp;
public:
DataWorker(ConvNet& convNet, CPUData& data);
virtual ~DataWorker();
bool run();
virtual void _run() = 0;
};
class TrainingWorker : public DataWorker {
protected:
bool _test;
double _progress;
public:
TrainingWorker(ConvNet& convNet, CPUData& data, double progress, bool test);
void _run();
};
class SyncWorker : public Worker {
public:
SyncWorker(ConvNet& convNet);
bool run();
};
class ExitWorker : public Worker {
public:
ExitWorker(ConvNet& convNet);
bool run();
};
class GradCheckWorker : public DataWorker {
public:
GradCheckWorker(ConvNet& convNet, CPUData& data);
void _run();
};
class MultiviewTestWorker : public DataWorker {
protected:
int _numViews;
Matrix* _cpuProbs;
std::string _logregName;
CPUData& getMinibatch(int v, int i);
public:
MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews, Matrix& cpuProbs, const char* softmaxName);
MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews);
~MultiviewTestWorker();
void _run();
};
class FeatureWorker : public DataWorker {
protected:
MatrixV *_ftrs;
stringv *_layerNames;
bool _deleteFeatures;
public:
FeatureWorker(ConvNet& convNet, CPUData& data, MatrixV& ftrs, stringv& layerNames, bool deleteFeatures=true);
~FeatureWorker();
void _run();
};
class DataGradWorker : public DataWorker {
protected:
Matrix* _dataGrads;
int _dataLayerIdx, _softmaxLayerIdx;
public:
DataGradWorker(ConvNet& convNet, CPUData& data, Matrix& dataGrads, int dataLayerIdx, int softmaxLayerIdx);
~DataGradWorker();
void _run();
};
#endif/* WORKER_CUH */

View File

@ -0,0 +1,107 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "../include/actbroadcaster.cuh"
using namespace std;
/*
* =====================
* BroadcastMessage
* =====================
*/
BroadcastMessage::BroadcastMessage(map<int, NVMatrix*> mats, int srcDevice, int userIdx, Queue<int>& finishQueue)
: _type(BROADCAST), _mats(mats), _srcDevice(srcDevice), _userIdx(userIdx), _finishQueue(&finishQueue) {
}
BroadcastMessage::BroadcastMessage(MESSAGE_TYPE type)
: _type(type), _finishQueue(NULL) {
}
int BroadcastMessage::getSrcDevice() {
return _srcDevice;
}
map<int, NVMatrix*>& BroadcastMessage::getMatrices() {
return _mats;
}
int BroadcastMessage::getUserIdx() {
return _userIdx;
}
Queue<int>& BroadcastMessage::getFinishQueue() {
return *_finishQueue;
}
BroadcastMessage::MESSAGE_TYPE BroadcastMessage::getMessageType() {
return _type;
}
/*
* =====================
* ExitBroadcastMessage
* =====================
*/
ExitBroadcastMessage::ExitBroadcastMessage() : BroadcastMessage(BroadcastMessage::EXIT) {
}
/*
* =====================
* ActBroadcaster
* =====================
*/
ActBroadcaster::ActBroadcaster(int numUsers, intv& cpus) : Thread(true, cpus), _numUsers(numUsers) {
}
ActBroadcaster::~ActBroadcaster() {
for (map<int,IBroadcastNetwork*>::const_iterator it = _broadcasters.begin(); it != _broadcasters.end(); ++it) {
delete it->second;
}
}
Queue<BroadcastMessage*>& ActBroadcaster::getMessageQueue() {
return _messageQueue;
}
void* ActBroadcaster::run() {
int nextUserIdx = 0;
bool exit = false;
while (!exit) {
BroadcastMessage& msg = *_messageQueue.dequeue();
if (msg.getMessageType() == BroadcastMessage::EXIT) {
exit = true;
delete &msg;
} else {
if (msg.getUserIdx() == nextUserIdx) {
if (_broadcasters.count(msg.getSrcDevice()) == 0) {
_broadcasters[msg.getSrcDevice()] = &IBroadcastNetwork::make(getKeys(msg.getMatrices()), msg.getSrcDevice());
}
_broadcasters[msg.getSrcDevice()]->broadcast(msg.getMatrices());
msg.getFinishQueue().enqueue(0);
delete &msg;
nextUserIdx = (nextUserIdx + 1) % _numUsers;
} else {
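// Broadcasts must be serviced in user order; this message is out of turn,
// so push it to the back of the queue and keep draining until the message
// for nextUserIdx shows up.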
_messageQueue.enqueue(&msg);
}
}
}
return NULL;
}
void ActBroadcaster::stop() {
getMessageQueue().enqueue(new ExitBroadcastMessage());
join();
}

View File

@ -0,0 +1,782 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <vector>
#include <iostream>
#include <string>
#include <set>
#include <map>
#include "../../nvmatrix/include/nvmatrix.cuh"
#include "../../nvmatrix/include/nvmatrix_operators.cuh"
#include "../../util/include/matrix.h"
#include "../include/convnet.cuh"
#include "../include/util.cuh"
using namespace std;
/*
* =======================
* ConvNet
* =======================
*/
ConvNet::ConvNet(PyObject* layerParams, intv& deviceIDs,
int minibatchSize, bool conserveMem) : Thread(true) {
_deviceIDs = deviceIDs;
_data = NULL;
_bufferData = NULL;
_bufferMinibatchIdx = -1;
_bufferPassIdx = -1;
_trainingProgress = 0;
_totalPassesDone = 0;
_conserveMem = conserveMem;
_sync = new ThreadSynchronizer(deviceIDs.size() + 1);
PyObjectV* layerList = pyDictGetValues(layerParams);
std::sort(layerList->begin(), layerList->end(), LayerIDComparator());
_dataCopyPD = new PipeDispenserBlocking(DIVUP(_deviceIDs.size(),2)); // hard-coded for now
initDataLayers(layerList);
initGPUThreads(layerList);
connectReplicas(); // Connect replicas to one another
connectChildren(layerParams); // Connect forward/backward links in graph
_numFwdTerminal = 0;
// Execute post-initialization stuff
for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
for (int r = 0; r < it->second.size(); r++) {
_numFwdTerminal += it->second[r]->getNext().size() == 0;
if (it->second[r]->getNext().size() == 0) {
printf("Fwd terminal: %s\n", it->second[r]->getName().c_str());
}
it->second[r]->postInit();
}
}
// Find and count the terminal nodes in the backward pass
for (int p = 0; p < getNumPasses(); p++) {
set<Layer*> visited;
_numBwdTerminal[p] = 0;
for (int t = 0; t < _convNetThreads.size(); t++) {
vector<CostLayer*>& cl = _convNetThreads[t]->getCostLayers();
for (int c = 0; c < cl.size(); c++) {
findBwdTerminal(*cl[c], visited, _numBwdTerminal[p], p);
}
}
}
_dp = new DataProvider(minibatchSize);
// Py_DECREF(layerList);
delete layerList;
}
ConvNet::~ConvNet() {
for (vector<ConvNetThread*>::const_iterator it = _convNetThreads.begin(); it != _convNetThreads.end(); ++it) {
(*it)->getMessageQueue().enqueue(new Message(EXIT_CONVNET));
(*it)->join();
delete *it;
}
for (DataLayerVector::const_iterator it = _dataLayers.begin(); it != _dataLayers.end(); ++it) {
delete *it;
}
for (intv::const_iterator it = _deviceIDs.begin(); it != _deviceIDs.end(); ++it) {
DEVICE_MEMORY_MANAGER::destroyInstance(*it);
}
HOST_MEMORY_MANAGER::destroyInstance();
delete _sync;
delete _dataCopyPD;
delete _dp;
}
void ConvNet::stop() {
getWorkerQueue().enqueue(new ExitWorker(*this));
join();
}
PipeDispenser& ConvNet::getDataCopyPD() {
return *_dataCopyPD;
}
void ConvNet::initDataLayers(PyObjectV* layerList) {
for (int i = 0; i < layerList->size(); i++) {
PyObject* paramsDict = layerList->at(i);
std::string layerType = pyDictGetString(paramsDict, "type");
if (layerType == "data") {
int numReplicas = pyDictGetInt(paramsDict, "numReplicas");
for (int r = 0; r < numReplicas; ++r) {
DataLayer* dataLayer = new DataLayer(this, paramsDict, r);
_dataLayers.push_back(dataLayer);
_layerMap[dataLayer->getName()][r] = dataLayer;
}
}
}
}
void ConvNet::initGPUThreads(PyObjectV* layerList) {
// Initialize GPU worker threads
for (int i = 0; i < _deviceIDs.size(); ++i) {
ConvNetThread* cng = new ConvNetThread(layerList, _deviceIDs[i], i, this);
_convNetThreads.push_back(cng);
for (NameLayerMap::iterator it = cng->getLayerMap().begin(); it != cng->getLayerMap().end(); ++it) {
const std::string& name = it->first;
Layer* layer = it->second;
_layerMap[name][layer->getReplicaID()] = layer;
}
}
}
void ConvNet::connectReplicas() {
_numReplicasMax = 0;
_numReplicasMin = 1 << 16;
for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
_numReplicasMax = max(_numReplicasMax, int(it->second.size()));
_numReplicasMin = min(_numReplicasMin, int(it->second.size()));
for (map<int,Layer*>::iterator it2 = it->second.begin(); it2 != it->second.end(); ++it2) {
Layer& l1 = *it2->second;
for (map<int,Layer*>::iterator it3 = it->second.begin(); it3 != it->second.end(); ++it3) {
Layer& l2 = *it3->second;
l1.addReplica(l2);
}
}
}
}
void ConvNet::connectChildren(PyObject* layerParams) {
for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
PyObject* paramsDict = PyDict_GetItemString(layerParams, it->first.c_str());
PyObject* inputList = PyDict_GetItemString(paramsDict, "inputs");
if (inputList != NULL) {
// Iterate over "replicas" of this layer
int numReplicas = _layerMap[it->first].size();
for (int i = 0; i < PyList_GET_SIZE(inputList); i++) {
std::string inputName = PyString_AsString(PyList_GetItem(inputList, i));
int numReplicasPrev = _layerMap[inputName].size();
// How many replicas from the previous layer must this layer be connected to?
int numInputReplicas = numReplicasPrev / numReplicas;
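// Worked example: numReplicasPrev = 4 and numReplicas = 2 give
// numInputReplicas = 2, so replica r = 0 connects to input replicas 0 and 2
// and replica r = 1 to input replicas 1 and 3.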
for (int r = 0; r < numReplicas; r++) {
for (int rp = r, ridx = 0; ridx < numInputReplicas; rp += numReplicas, ridx++) {
it->second[r]->addPrev(*_layerMap[inputName][rp], ridx);
_layerMap[inputName][rp]->addNext(*it->second[r]);
}
}
}
}
}
}
void ConvNet::findBwdTerminal(Layer& l, set<Layer*>& visited, int& terminal, int passIdx) {
if (visited.count(&l) == 0) {
visited.insert(&l);
if (l.isGradConsumer()) {
bool hasPrevConsumer = false;
if (l.getPrev().size() > 0) {
for (int i = 0; i < l.getPrev()[0].size(); i++) {
// Looking only at 0th replica is fine to see if you have
// grad consumers below you.
hasPrevConsumer |= l.getPrev()[0][i]->isGradConsumer();
}
}
if (!hasPrevConsumer || !l.isGradProducer() || (passIdx + 1 < l.getNumReplicasPrev() && l.getNumReplicasPrev() > l.getNumReplicas())) {
terminal++;
l.setBwdTerminal(passIdx);
printf("found bwd terminal %s[%d] in passIdx=%d\n", l.getName().c_str(), l.getReplicaID(), passIdx);
} else if (l.isGradProducer()) {
for (int r = 0; r < l.getPrev().size(); r++) {
for (int i = 0; i < l.getPrev()[r].size(); i++) {
findBwdTerminal(*l.getPrev()[r][i], visited, terminal, passIdx);
}
}
}
}
}
}
void* ConvNet::run() {
for (vector<ConvNetThread*>::const_iterator it = _convNetThreads.begin(); it != _convNetThreads.end(); ++it) {
(*it)->start();
}
// The manager thread defaults to using the GPU of the first worker.
// Put more logic here if this is inappropriate.
NVMatrix::setDeviceID(_convNetThreads[0]->getDeviceID());
copyToGPU();
bool exit = false;
while (!exit) {
Worker* worker = _workerQueue.dequeue();
exit = worker->run();
delete worker;
}
return NULL;
}
Queue<Worker*>& ConvNet::getWorkerQueue() {
return _workerQueue;
}
Queue<WorkResult*>& ConvNet::getResultQueue() {
return _resultQueue;
}
DataProvider& ConvNet::getDataProvider() {
return *_dp;
}
Layer& ConvNet::getLayer(std::string& name, int replicaID) {
return *_layerMap[name][replicaID];
}
void ConvNet::sendMessage(MESSAGES msg, bool sync) {
sendMessage(new Message(msg), sync);
}
void ConvNet::sendMessage(Message* msg, bool sync) {
for (int i = 0; i < _convNetThreads.size(); i++) {
_convNetThreads[i]->getMessageQueue().enqueue(msg->clone());
}
delete msg;
if (sync) {
syncWithChildren();
}
}
void ConvNet::copyToCPU() {
sendMessage(COPY_TO_CPU, true);
}
void ConvNet::copyToGPU() {
sendMessage(COPY_TO_GPU, false);
}
void ConvNet::updateWeights(int passIdx) {
sendMessage(UPDATE_WEIGHTS, true);
sendMessage(CONSTRAIN_WEIGHTS, true);
}
void ConvNet::reset(int passIdx) {
sendMessage((passIdx % getNumPasses()) == 0 ? RESET : RESET_PASS_IDX, false);
}
void ConvNet::reset() {
reset(0);
}
// Fprop given data
void ConvNet::fprop(CPUData& data, int passIdx, PASS_TYPE passType) {
reset(passIdx);
// This is necessary because setData below could delete data. If there's
// an outstanding copy request, this'll cause a segfault.
for (int i = 0; i < _dataLayers.size(); i++) {
_dataLayers[i]->waitForCopyFinish();
}
setData(data, passIdx);
for (int i = 0; i < _dataLayers.size(); i++) {
_dataLayers[i]->fprop(passType, passIdx, false);
}
waitForTerminals(_numFwdTerminal, FPROP_TERMINAL);
}
// Fprop given minibatch idx
void ConvNet::fprop(int miniIdx, int passIdx, PASS_TYPE passType) {
reset(passIdx);
bool fromBuffer = miniIdx == _bufferMinibatchIdx && passIdx == _bufferPassIdx;
if (!fromBuffer) {
// This is necessary because setData below could delete data. If there's
// an outstanding copy request, this'll cause a segfault.
for (int i = 0; i < _dataLayers.size(); i++) {
_dataLayers[i]->waitForCopyFinish();
}
setData(_dp->getMinibatch(miniIdx), passIdx);
} else {
setDataFromBuffer();
}
for (int i = 0; i < _dataLayers.size(); i++) {
_dataLayers[i]->fprop(passType, passIdx, fromBuffer);
}
if (passIdx == getNumPasses() - 1) {
// Double-buffer the next minibatch from the DataProvider
setBuffer(miniIdx == _dp->getNumMinibatches() - 1 ? NULL : &_dp->getMinibatch(miniIdx + 1), miniIdx + 1, 0);
} else {
// Double-buffer the next microbatch within the current minibatch
setBuffer(_data, miniIdx, passIdx + 1);
}
waitForTerminals(_numFwdTerminal, FPROP_TERMINAL);
}
void ConvNet::setDataFromBuffer() {
if (_bufferData != _data) {
delete _data;
}
_data = _bufferData;
_bufferData = NULL;
_bufferMinibatchIdx = -1;
_bufferPassIdx = -1;
}
void ConvNet::setData(CPUData& data, int passIdx) {
bool same = _data == _bufferData;
if (&data != _data) {
delete _data;
}
if (&data != _bufferData && !same) {
delete _bufferData;
_bufferData = NULL;
_bufferMinibatchIdx = -1;
_bufferPassIdx = -1;
}
_data = &data;
for (int i = 0; i < _dataLayers.size(); i++) {
_dataLayers[i]->copyData(*_data, false, passIdx);
}
}
void ConvNet::setBuffer(CPUData* bufferData, int bufferMinibatchIdx, int bufferPassIdx) {
_bufferData = bufferData;
_bufferMinibatchIdx = bufferMinibatchIdx;
_bufferPassIdx = bufferPassIdx;
if (bufferData != NULL) {
for (int i = 0; i < _dataLayers.size(); i++) {
_dataLayers[i]->copyData(*_bufferData, true, bufferPassIdx);
}
}
}
CPUData& ConvNet::getData() {
assert(_data != NULL);
return *_data;
}
void ConvNet::bprop(int passIdx, PASS_TYPE passType) {
_totalPassesDone++;
sendMessage(new BpropStartMessage(passType, passIdx), false);
waitForTerminals(_numBwdTerminal[passIdx], BPROP_TERMINAL);
reset(passIdx + 1);
}
void ConvNet::waitForTerminals(int numMsgs, MESSAGES msgType) {
for (int rcvd = 0; rcvd < numMsgs; rcvd++) {
Message* m = _msgQueue.dequeue();
assert(m->getType() == msgType);
delete m;
}
}
// Same as getCost() but adds results to given cost and returns it
Cost& ConvNet::getCost(Cost& cost) {
Cost &tmp = getCost();
cost += tmp;
delete &tmp;
return cost;
}
Cost& ConvNet::getCost() {
Cost& cost = *new Cost();
for (int t = 0; t < _convNetThreads.size(); t++) {
Cost& tcost = _convNetThreads[t]->getCost();
cost += tcost;
delete &tcost;
}
return cost;
}
double ConvNet::getCostValue() {
Cost& cost = getCost();
double val = cost.getValue();
delete &cost;
return val;
}
Queue<Message*>& ConvNet::getMessageQueue() {
return _msgQueue;
}
intv& ConvNet::getDeviceIDs() {
return _deviceIDs;
}
ThreadSynchronizer& ConvNet::getSync() {
return *_sync;
}
void ConvNet::syncWithChildren() {
sendMessage(SYNC, false);
_sync->sync();
}
int ConvNet::getTotalPassesDone() {
return _totalPassesDone;
}
int ConvNet::getMinibatchSize() {
return _dp->getMinibatchSize();
}
int ConvNet::getNumReplicasMax() {
return _numReplicasMax;
}
int ConvNet::getNumReplicasMin() {
return _numReplicasMin;
}
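// Each minibatch is processed in _numReplicasMax / _numReplicasMin passes:
// fprop() above advances one microbatch per pass and double-buffers the
// next one between passes.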
int ConvNet::getNumPasses() {
return _numReplicasMax / _numReplicasMin;
}
void ConvNet::setTrainingProgress(double progress) {
_trainingProgress = progress;
}
double ConvNet::getTrainingProgress() const {
return _trainingProgress;
}
bool ConvNet::isConserveMemory() {
return _conserveMem;
}
/*
* Gradient checking stuff
*/
void ConvNet::checkGradients() {
_numFailures = 0;
_numTests = 0;
_baseErr = 0;
for (int p = 0; p < getNumPasses(); ++p) {
fprop(0, p, PASS_GC);
_baseErr += getCostValue();
bprop(p, PASS_GC);
}
// We call grad check only on the first replica,
// but because weights are aware of their fellow replicas,
// we can simultaneously perturb the weights of all
// replicas.
for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
map<int, Layer*>& layers = it->second;
if (layers[0]->getDeviceID() >= 0 /*&& (layers[0]->getName() == "fc10")*/) { // If layer on GPU (data layers aren't)
layers[0]->checkGradient();
}
}
cout << "------------------------" << endl;
if (_numFailures > 0) {
cout << _numFailures << "/" << _numTests << " TESTS FAILED" << endl;
} else {
cout << "ALL " << _numTests << " TESTS PASSED" << endl;
}
}
// Copies to all replicas
void ConvNet::checkGradient_copyWeightsToGPU(Matrix& weightsCPU, Weights& weights) {
int d = NVMatrix::getDeviceID();
for (map<int, Weights*>::const_iterator it = weights.getReplicas().begin(); it != weights.getReplicas().end(); ++it) {
NVMatrix::setDeviceID(it->second->getDeviceID());
it->second->getW().copyFromHost(weightsCPU);
}
NVMatrix::setDeviceID(d);
}
/*
* name: weight matrix name
* eps: finite difference step
*/
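// One-sided finite difference: numGrad(i,j) = (E(w_ij + eps) - E(w)) / (numCases * eps),
// with E summed over all passes. The check below compares the relative error
// ||numGrad - analyticGrad|| / ||analyticGrad|| against GC_REL_ERR_THRESH.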
bool ConvNet::checkGradient(const std::string& name, float eps, Weights& weights) {
Matrix numGrad(weights.getNumRows(), weights.getNumCols());
Matrix diff(numGrad);
numGrad.apply(Matrix::ZERO);
Matrix weightsCPU;
weights.getW().copyToHost(weightsCPU, true);
for(int i = 0; i < weights.getNumRows(); i++) {
for (int j = 0; j < weights.getNumCols(); j++) {
float v = weightsCPU(i,j);
weightsCPU(i,j) += eps;
checkGradient_copyWeightsToGPU(weightsCPU, weights);
weightsCPU(i,j) = v;
double err = 0;
for (int p = 0; p < getNumPasses(); ++p) {
// printf("trying fprop %d\n", p);
fprop(0, p, PASS_GC);
// printf(" success\n");
err += getCostValue();
}
numGrad(i,j) = (err - _baseErr) / (_data->getNumCases() * eps);
if (isnan((double)numGrad(i,j)) || isinf((double)numGrad(i,j))) {
cout << "Numerical computation produced nan or inf when checking '" << name << "': " << numGrad(i,j) << endl;
cout << "Consider reducing the sizes of the weights or finite difference steps." << endl;
cout << "Exiting." << endl;
exit(1);
}
checkGradient_copyWeightsToGPU(weightsCPU, weights);
}
}
Matrix gradCPU;
NVMatrix::setDeviceID(weights.getDeviceID());
map<int,NVMatrix*> mats;
for (map<int, Weights*>::const_iterator it = weights.getReplicas().begin(); it != weights.getReplicas().end(); ++it) {
mats[it->first] = &it->second->getGrad();
}
weights.getReducer().reduce(mats, 1, false);
weights.getGrad().copyToHost(gradCPU, true);
gradCPU.scale(-1.0 / _data->getNumCases());
float analNorm = gradCPU.norm();
float numNorm = numGrad.norm();
numGrad.subtract(gradCPU, diff);
float relErr = diff.norm() / analNorm;
bool fail = relErr >= GC_REL_ERR_THRESH;
if (fail || !GC_SUPPRESS_PASSES) {
cout << "========================" << endl;
printf("(%s) %s GRADIENT CHECK\n", fail ? "****FAIL****" : "PASS", name.c_str());
cout << "========================" << endl;
cout << "Analytic:" << endl;
gradCPU.print(0, 6, 0, 4);
cout << "Numeric:" << endl;
numGrad.print(0, 6, 0, 4);
printf("Analytic norm: %e\n", analNorm);
printf("Numeric norm: %e\n", numNorm);
printf("Relative error: %e\n", relErr);
}
_numTests++;
_numFailures += fail;
return fail;
}
/*
* =======================================================================================================
* ConvNetThread
* =======================================================================================================
*/
ConvNetThread::ConvNetThread(PyObjectV* layerList, int deviceID, int deviceIdx, ConvNet* convNet)
: Thread(true, getDeviceCPUs(deviceID)), _deviceID(deviceID), _convNet(convNet) {
try {
int numLayers = layerList->size();
for (int i = 0; i < numLayers; i++) {
PyObject* paramsDict = layerList->at(i);
std::string layerType = pyDictGetString(paramsDict, "type");
if (layerType != "data") {
intv& gpus = *pyDictGetIntV(paramsDict, "gpu");
int rid = indexOf(gpus, deviceIdx);
if (rid >= 0) {
initLayer(paramsDict, rid);
}
delete &gpus;
}
}
} catch (std::string& s) {
cout << "Error creating ConvNet: " << s << endl;
exit(1);
}
}
ConvNetThread::~ConvNetThread() {
NVMatrix::setDeviceID(_deviceID);
NVMatrix::destroyCublas();
NVMatrix::destroyRandom();
for (NameLayerMap::const_iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
delete it->second;
}
_nameLayerMap.clear();
}
void ConvNetThread::startTimer() {
NVMatrix::syncStream();
_timer.start();
}
double ConvNetThread::stopTimer() {
NVMatrix::syncStream();
return _timer.stop();
}
void ConvNetThread::initLayer(PyObject* paramsDict, int replicaID) {
std::string type = pyDictGetString(paramsDict, "type");
std::string name = pyDictGetString(paramsDict, "name");
if (type == "fc") {
_nameLayerMap[name] = new FCLayer(this, paramsDict, replicaID, false);
} else if (type == "sfc") {
_nameLayerMap[name] = new SplitFCLayer(this, paramsDict, replicaID, false);
} else if (type == "conv") {
_nameLayerMap[name] = new ConvLayer(this, paramsDict, replicaID);
} else if (type == "local") {
_nameLayerMap[name] = new LocalUnsharedLayer(this, paramsDict, replicaID);
} else if (type == "pool") {
_nameLayerMap[name] = &PoolLayer::make(this, paramsDict, replicaID);
} else if (type == "cmpool") {
_nameLayerMap[name] = &CrossMapPoolLayer::make(this, paramsDict, replicaID);
} else if (type == "rnorm") {
_nameLayerMap[name] = new ResponseNormLayer(this, paramsDict, replicaID);
} else if (type == "cmrnorm") {
_nameLayerMap[name] = new CrossMapResponseNormLayer(this, paramsDict, replicaID);
} else if (type == "cnorm") {
_nameLayerMap[name] = new ContrastNormLayer(this, paramsDict, replicaID);
} else if (type == "softmax") {
_nameLayerMap[name] = new SoftmaxLayer(this, paramsDict, replicaID);
} else if (type == "eltsum") {
_nameLayerMap[name] = new EltwiseSumLayer(this, paramsDict, replicaID);
} else if (type == "eltmax") {
_nameLayerMap[name] = new EltwiseMaxLayer(this, paramsDict, replicaID);
} else if (type == "neuron") {
_nameLayerMap[name] = new NeuronLayer(this, paramsDict, replicaID);
} else if (type == "nailbed") {
_nameLayerMap[name] = new NailbedLayer(this, paramsDict, replicaID);
} else if (type == "blur") {
_nameLayerMap[name] = new GaussianBlurLayer(this, paramsDict, replicaID);
} else if (type == "href") {
_nameLayerMap[name] = new HorizontalReflectionLayer(this, paramsDict, replicaID);
} else if (type == "resize") {
_nameLayerMap[name] = new ResizeLayer(this, paramsDict, replicaID);
} else if (type == "rgb2yuv") {
_nameLayerMap[name] = new RGBToYUVLayer(this, paramsDict, replicaID);
} else if (type == "rgb2lab") {
_nameLayerMap[name] = new RGBToLABLayer(this, paramsDict, replicaID);
} else if (type == "rscale") {
_nameLayerMap[name] = new RandomScaleLayer(this, paramsDict, replicaID);
} else if (type == "crop") {
_nameLayerMap[name] = new CropLayer(this, paramsDict, replicaID);
} else if (type == "concat") {
_nameLayerMap[name] = new ConcatenationLayer(this, paramsDict, replicaID);
} else if (type == "pass") {
_nameLayerMap[name] = new PassThroughLayer(this, paramsDict, replicaID);
} else if (type == "dropout") {
_nameLayerMap[name] = new DropoutLayer(this, paramsDict, replicaID);
} else if (type == "dropout2") {
_nameLayerMap[name] = new Dropout2Layer(this, paramsDict, replicaID);
} else if (strncmp(type.c_str(), "cost.", 5) == 0) {
CostLayer *c = &CostLayer::make(this, paramsDict, type, replicaID);
_nameLayerMap[name] = c;
_costs.push_back(c);
} else {
throw std::string("Unknown layer type ") + type;
}
}
/*
* This executes in a new CPU thread so it's OK to initialize CUDA stuff here.
*/
void ConvNetThread::initCuda() {
NVMatrix::setDeviceID(_deviceID);
checkCudaErrors(cudaDeviceSetCacheConfig(cudaFuncCachePreferShared));
for (int i = 0; i < _convNet->getDeviceIDs().size(); i++) {
int d = _convNet->getDeviceIDs()[i];
if (d != _deviceID) {
if (NVMatrix::canAccessPeer(_deviceID, d)) {
printf("Enabling peer access GPU %d --> GPU %d\n", NVMatrix::getDeviceID(), d);
checkCudaErrors(cudaDeviceEnablePeerAccess(d, 0));
} else {
printf("No peer access GPU %d --> GPU %d\n", _deviceID, d);
}
}
}
// NVMatrix::syncStream();
NVMatrix::initCublas();
NVMatrix::initRandom(/*7*/);
srand(time(0));
}
void* ConvNetThread::run() {
initCuda();
bool exit = false;
while (!exit) {
Message* m = _msgQueue.dequeue();
if (m->getType() == FPROP_READY) {
FpropMessage* msg = static_cast<FpropMessage*>(m);
msg->getToLayer().fprop(msg->getPassType(), msg->getPassIdx());
} else if (m->getType() == BPROP_READY) {
BpropMessage* msg = static_cast<BpropMessage*>(m);
msg->getToLayer().incRcvdBInputMsgs();
msg->getToLayer().bprop(msg->getPassType(), msg->getPassIdx());
} else if (m->getType() == BPROP_START) {
BpropStartMessage* msg = static_cast<BpropStartMessage*>(m);
for (int i = 0; i < _costs.size(); i++) {
dynamic_cast<Layer*>(_costs[i])->bprop(msg->getPassType(), msg->getPassIdx());
}
} else if (m->getType() == SYNC) {
NVMatrix::syncStream();
_convNet->getSync().sync();
} else if (m->getType() == COPY_TO_CPU) {
for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
it->second->copyToCPU();
}
} else if (m->getType() == COPY_TO_GPU) {
for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
it->second->copyToGPU();
}
} else if (m->getType() == RESET) {
for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
it->second->reset();
}
} else if (m->getType() == RESET_PASS_IDX) {
for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
it->second->resetPassIdx();
}
} else if (m->getType() == UPDATE_WEIGHTS) {
for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
it->second->updateWeights();
}
} else if (m->getType() == CONSTRAIN_WEIGHTS) {
for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
it->second->constrainWeights();
}
} else if (m->getType() == EXIT_CONVNET) {
exit = true;
}
delete m;
}
return NULL;
}
Cost& ConvNetThread::getCost() {
// In a single ConvNetThread, all costs are guaranteed to be different
// (i.e. not replicas of one another)
return *new Cost(_costs);
}
Layer& ConvNetThread::getLayer(std::string& name) {
return *_nameLayerMap[name];
}
int ConvNetThread::getDeviceID() {
return _deviceID;
}
Queue<Message*>& ConvNetThread::getMessageQueue() {
return _msgQueue;
}
vector<CostLayer*>& ConvNetThread::getCostLayers() {
return _costs;
}
NameLayerMap& ConvNetThread::getLayerMap() {
return _nameLayerMap;
}
ConvNet& ConvNetThread::getConvNet() {
return *_convNet;
}

View File

@ -0,0 +1,378 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "../include/copypipeline.cuh"
//#include "gpu_util.cuh"
using namespace std;
/* =========================
* ICopySegment
* =========================
*/
ICopySegment::ICopySegment(IBroadcastNetwork& parent, int deviceID, Queue<int>* finishQueue)
: Thread(true, getDeviceCPUs(parent.getSourceDeviceID())), _parent(&parent), _prev(NULL), _stream(NULL), _deviceID(deviceID), _finishQueue(finishQueue) {
_execDeviceID = _deviceID;
}
ICopySegment::~ICopySegment() {
if (_stream != NULL) {
checkCudaErrors(cudaStreamDestroy(_stream));
}
}
void* ICopySegment::run() {
assert(_execDeviceID != DEVICE_HOST);
NVMatrix::setDeviceID(_execDeviceID);
checkCudaErrors(cudaStreamCreateWithFlags(&_stream, cudaStreamNonBlocking));
bool exit = false;
while (!exit) {
CopyMessage& msg = *_queue.dequeue();
if (msg.getType() == CopyMessage::EXIT) {
exit = true;
} else {
bool term = processMessage(msg);
if (term) {
assert(_finishQueue != NULL);
_finishQueue->enqueue(1);
}
}
delete &msg;
}
return NULL;
}
NVMatrix& ICopySegment::getChunk(NVMatrix& mat, int chunkSize, int chunkIdx) {
NVMatrix& line = mat.reshaped(1, mat.getNumElements());
int start = chunkIdx * chunkSize;
int end = min((chunkIdx+1) * chunkSize, mat.getNumElements());
NVMatrix& chunk = line.sliceCols(start, end);
delete &line;
return chunk;
}
inline NVMatrix& ICopySegment::getMatrix(CopyMessage& msg) {
if (getDeviceID() == DEVICE_HOST) {
return _hmat;
}
return msg.getMatrix(getDeviceID());
}
Queue<CopyMessage*>& ICopySegment::getQueue() {
return _queue;
}
inline int ICopySegment::getDeviceID() {
return _deviceID;
}
void ICopySegment::addPrev(ICopySegment& c) {
_prev = &c;
if (_deviceID == DEVICE_HOST) {
_execDeviceID = c.getDeviceID();
}
}
void ICopySegment::addNext(CopyPeer& c) {
_next.push_back(&c);
c.addPrev(*this);
}
bool ICopySegment::isTerminal() const {
return _next.size() == 0;
}
/* =========================
* CopySource
* =========================
*/
CopySource::CopySource(IBroadcastNetwork& parent, int deviceID) : ICopySegment(parent, deviceID, NULL) {
}
bool CopySource::processMessage(CopyMessage& msg) {
assert(msg.getType() == CopyMessage::COPY_START);
int numChunks = min(getMatrix(msg).getNumElements(), max(COPY_MIN_CHUNKS, min(COPY_MAX_CHUNKS, DIVUP(getMatrix(msg).getNumElements(), COPY_MIN_CHUNK_SIZE))));
int chunkSize = DIVUP(getMatrix(msg).getNumElements(), numChunks);
// printf("num chunks: %d\n", numChunks);
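// Note the <= bound: the extra message with chunkIdx == numChunks carries no
// data and acts as a completion marker (see CopyPeer::processMessage).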
for (int c = 0; c <= numChunks; ++c) {
for (vector<CopyPeer*>::const_iterator it = _next.begin(); it != _next.end(); ++it) {
(*it)->getQueue().enqueue(new CopyChunkMessage(c, chunkSize, numChunks, msg.getScaleSource(), msg.getScaleTargets(), msg.getMatrices()));
}
}
return false;
}
inline bool CopySource::isSource() const {
return true;
}
/* =========================
* CopyPeer
* =========================
*/
CopyPeer::CopyPeer(IBroadcastNetwork& parent, int deviceID, Queue<int>* finishQueue) : ICopySegment(parent, deviceID, finishQueue) {
}
bool CopyPeer::processMessage(CopyMessage& msg) {
assert(msg.getType() == CopyMessage::COPY_CHUNK);
CopyChunkMessage& cmsg = *static_cast<CopyChunkMessage*>(&msg);
if (cmsg.getChunkIdx() < cmsg.getNumChunks()) {
if (!isTerminal() || (isTerminal() && msg.getScaleTargets() == 0)) {
getMatrix(msg).resize(_prev->getMatrix(msg));
}
// getMatrix(msg).printShape("getMatrix(msg)");
// _prev->getMatrix(msg).printShape("_prev->getMatrix(msg)");
assert(getMatrix(msg).isSameDims(_prev->getMatrix(msg)));
const float scaleSelf = isTerminal() ? msg.getScaleTargets() : 0;
const float scalePrev = _prev->isSource() ? msg.getScaleSource() : 1;
NVMatrix& prevChunk = getChunk(_prev->getMatrix(msg), cmsg.getChunkSize(), cmsg.getChunkIdx());
NVMatrix& myChunk = getChunk(getMatrix(msg), cmsg.getChunkSize(), cmsg.getChunkIdx());
prevChunk.add(myChunk, scalePrev, scaleSelf, myChunk, _stream);
NVMatrix::syncStream(_stream);
delete &prevChunk;
delete &myChunk;
}
for (vector<CopyPeer*>::const_iterator it = _next.begin(); it != _next.end(); ++it) {
(*it)->getQueue().enqueue(new CopyChunkMessage(cmsg));
}
return cmsg.getChunkIdx() >= cmsg.getNumChunks() && isTerminal();
}
inline bool CopyPeer::isSource() const {
return false;
}
/* =========================
* IBroadcastNetwork
* =========================
*/
IBroadcastNetwork& IBroadcastNetwork::make(set<int> devices, int srcDevice) {
if (devices.size() == 8) {
return (new EightGPUBroadcaster1(devices, srcDevice))->construct();
} else if (devices.size() == 1) {
return (new NullBroadcaster(devices, srcDevice))->construct();
} else if (devices.size() == 2 && NVMatrix::canAccessPeer(*devices.begin(), *(++devices.begin()))) {
return (new TwoPeeringGPUsBroadcaster(devices, srcDevice))->construct();
}
return (new NaiveBroadcaster(devices, srcDevice))->construct();
}
IBroadcastNetwork::IBroadcastNetwork(set<int>& devices, int srcDeviceID, int numTerminal)
: _devices(devices), _srcDeviceID(srcDeviceID), _numTerminal(numTerminal), _constructed(false), _src(NULL) {
}
IBroadcastNetwork::~IBroadcastNetwork() {
vector<ICopySegment*> v;
v.insert(v.end(), _peers.begin(), _peers.end());
v.insert(v.end(), _src);
for (vector<ICopySegment*>::const_iterator it = v.begin(); it != v.end(); ++it) {
(*it)->getQueue().enqueue(new CopyMessage(CopyMessage::EXIT));
(*it)->join();
delete *it;
}
}
IBroadcastNetwork& IBroadcastNetwork::construct() {
assert(!_constructed);
pair<vector<int>,vector<int> > gpus = makeGPULists();
_src = new CopySource(*this, _srcDeviceID);
makePeers(gpus);
makeConnections();
_src->start();
for (vector<CopyPeer*>::const_iterator it = _peers.begin(); it != _peers.end(); ++it) {
(*it)->start();
}
_constructed = true;
return *this;
}
pair<vector<int>,vector<int> > IBroadcastNetwork::makeGPULists() {
vector<int> same, other;
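// Partition non-source devices into those that can peer with the source
// ("same") and the rest ("other"); inserting at a random position shuffles
// the order within each group.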
for (set<int>::const_iterator it = _devices.begin(); it != _devices.end(); ++it) {
if (*it != _srcDeviceID) {
if (NVMatrix::canAccessPeer(_srcDeviceID, *it)) {
same.insert(same.begin() + rand() % (1 + same.size()), *it);
} else {
other.insert(other.begin() + rand() % (1 + other.size()), *it);
}
}
}
return pair<vector<int>,vector<int> >(same, other);
}
void IBroadcastNetwork::broadcast(std::map<int, NVMatrix*>& mats) {
_broadcast(mats, 1, 0);
}
void IBroadcastNetwork::_broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets) {
assert(_constructed);
assert(_finishQueue.getNumElements() == 0);
assert(mats.size() == _devices.size());
assert(mats.size() > 1);
if (mats[_srcDeviceID]->getNumElements() == 0) {
for (map<int,NVMatrix*>::const_iterator it = mats.begin(); it != mats.end(); ++it) {
it->second->resize(*mats[_srcDeviceID]);
}
} else {
_src->getQueue().enqueue(new CopyStartMessage(scaleSource, scaleTargets, mats));
for (int i = 0; i < _numTerminal; ++i) {
_finishQueue.dequeue();
}
}
assert(_finishQueue.getNumElements() == 0);
}
int IBroadcastNetwork::getSourceDeviceID() const {
return _srcDeviceID;
}
void IBroadcastNetwork::makePeers(pair<vector<int>,vector<int> >& gpus) {
vector<int>& same = gpus.first, &other = gpus.second;
for (int i = 0; i < same.size(); ++i) {
_peers.push_back(new CopyPeer(*this, same[i], &_finishQueue));
}
for (int i = 0; i < other.size(); ++i) {
_peers.push_back(new CopyPeer(*this, other[i], &_finishQueue));
}
_peers.push_back(new CopyPeer(*this, DEVICE_HOST, &_finishQueue)); // peers[7]
}
/* =========================
* ISafeBroadcastNetwork
* =========================
*/
ISafeBroadcastNetwork& ISafeBroadcastNetwork::make(set<int> devices, int srcDevice) {
if (devices.size() == 1) {
return (new NullBroadcaster(devices, srcDevice))->construct();
} else if (devices.size() == 2 && NVMatrix::canAccessPeer(*devices.begin(), *(++devices.begin()))) {
return (new TwoPeeringGPUsBroadcaster(devices, srcDevice))->construct();
}
return (new NaiveBroadcaster(devices, srcDevice))->construct();
}
ISafeBroadcastNetwork::ISafeBroadcastNetwork(std::set<int>& devices, int srcDeviceID, int numTerminal) : IBroadcastNetwork(devices, srcDeviceID, numTerminal) {
}
void ISafeBroadcastNetwork::broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets) {
_broadcast(mats, scaleSource, scaleTargets);
}
ISafeBroadcastNetwork& ISafeBroadcastNetwork::construct() {
IBroadcastNetwork::construct();
return *this;
}
/* =========================
* NullBroadcaster
* =========================
*/
NullBroadcaster::NullBroadcaster(std::set<int>& devices, int srcDeviceID) : ISafeBroadcastNetwork(devices, srcDeviceID, 0) {
}
void NullBroadcaster::makeConnections() {
}
NullBroadcaster& NullBroadcaster::construct() {
_constructed = true;
return *this;
}
void NullBroadcaster::broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets) {
}
void NullBroadcaster::broadcast(std::map<int, NVMatrix*>& mats) {
}
/* =========================
* NaiveBroadcaster
* =========================
*
* This one does src -> host -> all
*/
NaiveBroadcaster::NaiveBroadcaster(std::set<int>& devices, int srcDeviceID) : ISafeBroadcastNetwork(devices, srcDeviceID, devices.size()-1) {
}
void NaiveBroadcaster::makeConnections() {
_src->addNext(*_peers.back()); // Make connection src -> host
for (int i = 0; i < _peers.size() - 1; ++i) {
if (_peers[i]->getDeviceID() != _src->getDeviceID()) {
_peers.back()->addNext(*_peers[i]); // Make connection host -> peer
}
}
}
/* =========================
* EightGPUBroadcaster1
* =========================
*
* This one does a fancy graph
*/
EightGPUBroadcaster1::EightGPUBroadcaster1(set<int>& devices, int srcDeviceID) : IBroadcastNetwork(devices, srcDeviceID, 4) {
}
void EightGPUBroadcaster1::makeConnections() {
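// Tree topology, with _peers[7] being the host staging segment:
//   src -> host -> { p0, p1 -> p2, p3 -> p5, p4 -> p6 }
// The four leaves (p0, p2, p5, p6) are the terminals counted by _numTerminal.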
_src->addNext(*_peers[7]);
_peers[7]->addNext(*_peers[0]);
_peers[7]->addNext(*_peers[1]);
_peers[7]->addNext(*_peers[3]);
_peers[7]->addNext(*_peers[4]);
_peers[1]->addNext(*_peers[2]);
_peers[3]->addNext(*_peers[5]);
_peers[4]->addNext(*_peers[6]);
}
/* =========================
* TwoPeeringGPUsBroadcaster
* =========================
*/
TwoPeeringGPUsBroadcaster::TwoPeeringGPUsBroadcaster(std::set<int>& devices, int srcDeviceID) : ISafeBroadcastNetwork(devices, srcDeviceID, 0) {
_tgtDeviceID = *devices.begin() == srcDeviceID ? *(++devices.begin()) : *devices.begin();
}
TwoPeeringGPUsBroadcaster::~TwoPeeringGPUsBroadcaster() {
if (_constructed) {
checkCudaErrors(cudaStreamDestroy(_tgtStream));
}
}
void TwoPeeringGPUsBroadcaster::makeConnections() {
}
void TwoPeeringGPUsBroadcaster::resetDeviceID(int d) {
if (d >= 0) {
NVMatrix::setDeviceID(d);
}
}
ISafeBroadcastNetwork& TwoPeeringGPUsBroadcaster::construct() {
assert(!_constructed);
int d = NVMatrix::getDeviceID();
NVMatrix::setDeviceID(_tgtDeviceID);
checkCudaErrors(cudaStreamCreateWithFlags(&_tgtStream, cudaStreamNonBlocking));
resetDeviceID(d);
_constructed = true;
return *this;
}
void TwoPeeringGPUsBroadcaster::_broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets) {
int d = NVMatrix::getDeviceID();
NVMatrix::setDeviceID(_tgtDeviceID);
mats[_tgtDeviceID]->add(*mats[_srcDeviceID], scaleTargets, scaleSource, *mats[_tgtDeviceID], _tgtStream);
NVMatrix::syncStream(_tgtStream);
resetDeviceID(d);
}

View File

@ -0,0 +1,113 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <iostream>
#include "../include/cost.cuh"
using namespace std;
/*
* =====================
* Cost
* =====================
*/
Cost::Cost() {
}
Cost::Cost(vector<CostLayer*>& costs) {
for (vector<CostLayer*>::iterator it = costs.begin(); it != costs.end(); ++it) {
_costMap[(*it)->getName()] = &(*it)->getCost();
_costCoeffMap[(*it)->getName()] = (*it)->getCoeff();
_numCases[(*it)->getName()] = (*it)->getNumCases();
}
}
int Cost::getNumCases() {
return _numCases.size() == 0 ? 0 : _numCases.begin()->second;
}
map<std::string,int>& Cost::getNumCasesMap() {
return _numCases;
}
doublev& Cost::operator [](const std::string s) {
return *_costMap[s];
}
CostMap& Cost::getCostMap() {
return _costMap;
}
CostCoeffMap& Cost::getCostCoeffMap() {
return _costCoeffMap;
}
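// Scalar objective: sum over cost layers of coeff * first cost component.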
double Cost::getValue() {
double val = 0;
for (CostMap::iterator it = _costMap.begin(); it != _costMap.end(); ++it) {
val += _costCoeffMap[it->first] * (it->second->size() == 0 ? 0 : it->second->at(0));
}
return val;
}
Cost& Cost::operator += (Cost& er) {
CostMap& otherMap = er.getCostMap();
CostCoeffMap& otherCoeffMap = er.getCostCoeffMap();
for (CostMap::const_iterator it = otherMap.begin(); it != otherMap.end(); ++it) {
bool newCost = _costMap.count(it->first) == 0;
if (newCost) {
_costMap[it->first] = new doublev();
_costCoeffMap[it->first] = otherCoeffMap[it->first];
_numCases[it->first] = er.getNumCasesMap()[it->first];
} else {
_numCases[it->first] += er.getNumCasesMap()[it->first];
}
doublev& myVec = *_costMap[it->first];
doublev& otherVec = *otherMap[it->first];
assert(myVec.size() == 0 || otherVec.size() == 0 || myVec.size() == otherVec.size());
// Add costs from otherVec to me
for (int i = 0; i < otherVec.size(); i++) {
if (myVec.size() <= i) {
myVec.push_back(0);
}
myVec[i] += otherVec[i];
}
}
return *this;
}
Cost::~Cost() {
for (CostMap::const_iterator it = _costMap.begin(); it != _costMap.end(); ++it) {
delete it->second;
}
}
void Cost::print() {
for (CostMap::const_iterator it = _costMap.begin(); it != _costMap.end(); ++it) {
printf("%s (%.3f): ", it->first.c_str(), _costCoeffMap[it->first]);
doublev& vec = *_costMap[it->first];
for (int z = 0; z < vec.size(); ++z) {
printf("%.3f", vec[z]);
if (z < vec.size() - 1) {
printf(", ");
}
}
printf("\n");
}
}

View File

@ -0,0 +1,82 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <algorithm>
#include <vector>
#include "../../util/include/matrix.h"
#include "../include/data.cuh"
#include "../include/timer.cuh"
using namespace std;
DataProvider::DataProvider(int minibatchSize) :
_minibatchSize(minibatchSize), _hData(NULL) {
}
void DataProvider::clearData() {
delete _hData;
_hData = NULL;
}
void DataProvider::setData(CPUData& hData) {
// DataWorker calls clearData
_hData = &hData;
assert(_hData != NULL);
}
CPUData& DataProvider::getMinibatch(int idx) {
assert(idx >= 0 && idx < getNumMinibatches());
return getDataSlice(idx * _minibatchSize, (idx + 1) * _minibatchSize);
}
CPUData& DataProvider::getDataSlice(int startCase, int endCase) {
assert(_hData != 0);
assert(_hData->getNumCases() > 0);
endCase = min(_hData->getNumCases(), endCase);
// TODO: maintain these matrices, no point re-creating them all the time
MatrixV& miniData = *new MatrixV();
for (int i = 0; i < _hData->getData().size(); i++) {
// NOTE: if hData is transposed, then the output minibatch matrix
// can be a view. No need to allocate new CPU memory here. Might
// want to look into optimizing that in the future, though it's
// unlikely to be a big deal.
if (_hData->isTrans()) {
miniData.push_back(&(*_hData)[i].sliceCols(startCase, endCase));
} else {
miniData.push_back(new Matrix());
(*_hData)[i].sliceCols(startCase, endCase, *miniData.back());
}
}
CPUData& cpuData = *new CPUData(&miniData);
return cpuData;
}
int DataProvider::getNumMinibatches() {
assert(_hData != 0);
assert(_hData->getNumCases() > 0);
return DIVUP(_hData->getNumCases(), _minibatchSize);
}
int DataProvider::getMinibatchSize() {
return _minibatchSize;
}
int DataProvider::getNumCases() {
assert(_hData != 0);
assert(_hData->getNumCases() > 0);
return _hData->getNumCases();
}

View File

@ -0,0 +1,202 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "../include/util.cuh"
#include "../include/gradreducer.cuh"
using namespace std;
/* =====================
 * IActGradReducer
* =====================
*/
IActGradReducer::IActGradReducer(Layer& parent, map<int, int> numExpectedMsgs)
: Thread(true, getDeviceCPUs(parent.getDeviceID())), _parent(&parent), _numExpectedMsgs(numExpectedMsgs) {
_numExpectedMsgsTotal = 0;
for (map<int,int>::const_iterator it = numExpectedMsgs.begin(); it != numExpectedMsgs.end(); ++it) {
_numExpectedMsgsTotal += it->second;
}
// printf("%s[%d] expected %d backward msgs\n", parent.getName().c_str(), parent.getReplicaID(), _numExpectedMsgsTotal);
}
IActGradReducer::~IActGradReducer() {
}
void* IActGradReducer::run() {
while (true) {
reset();
if (reduce()) {
break;
}
_finishQueue.enqueue(0);
}
return NULL;
}
// Cost layer will have nothing to dequeue, so just return immediately.
int IActGradReducer::waitForFinish() {
if (_numExpectedMsgsTotal > 0) {
int i = _finishQueue.dequeue();
assert(_finishQueue.getNumElements() == 0);
return i;
}
// printf("%s not waiting for finish\n", _name.c_str());
return 0;
}
IActGradReducer& IActGradReducer::makeGradReducer(Layer& parent, map<int, int> numExpectedMsgs) {
int tgtDeviceID = parent.getDeviceID();
if (numExpectedMsgs.count(tgtDeviceID) == 0) {
numExpectedMsgs[tgtDeviceID] = 0;
}
if (numExpectedMsgs.size() == 8) {
return *new ParallelActGradReducer(parent, numExpectedMsgs);
}
return *new SequentialActGradReducer(parent, numExpectedMsgs);
}
/* =====================
 * SequentialActGradReducer
* =====================
*/
SequentialActGradReducer::SequentialActGradReducer(Layer& parent, map<int, int> numExpectedMsgs)
: IActGradReducer(parent, numExpectedMsgs) {
intv deviceIDs;
int tgtDeviceID = parent.getDeviceID();
for (map<int, int>::const_iterator it = numExpectedMsgs.begin(); it != numExpectedMsgs.end(); ++it) {
if (it->first != tgtDeviceID) {
deviceIDs.push_back(it->first);
}
}
if (numExpectedMsgs[tgtDeviceID] > 0) {
deviceIDs.push_back(tgtDeviceID);
}
sort(deviceIDs.begin(), deviceIDs.end());
int firstDeviceIdx = 0, firstDeviceID = 1 << 16;
for (int i = 0; i < deviceIDs.size(); ++i) {
if (deviceIDs[i] >= tgtDeviceID && deviceIDs[i] < firstDeviceID) {
firstDeviceIdx = i;
firstDeviceID = deviceIDs[i];
}
}
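// Start the processing order at the smallest device ID >= tgtDeviceID so
// that, whenever tgtDeviceID expects messages, it comes first (asserted below).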
// This is the order in which we process devices.
for (int i = firstDeviceIdx; _deviceIDs.size() < deviceIDs.size(); i = (i + 1) % deviceIDs.size()) {
int d = deviceIDs[i];
_deviceIDs.push_back(d);
_messageQueues[d] = new Queue<int>();
}
//shuffleVector(_deviceIDs, 1, _deviceIDs.size());
_broadcaster = new StreamBroadcast();
// Note that we MUST process the tgtDeviceID first because
// we write to it at every iteration, and the computation
// thread writes to it too. By processing it first we ensure
// that there's no race condition.
assert(numExpectedMsgs[tgtDeviceID] == 0 || _deviceIDs[0] == tgtDeviceID);
reset();
}
SequentialActGradReducer::~SequentialActGradReducer() {
for(map<int,Queue<int>* >::const_iterator it = _messageQueues.begin(); it != _messageQueues.end(); ++it) {
delete it->second;
}
delete _broadcaster;
}
void SequentialActGradReducer::reset() {
for (map<int,int>::iterator it = _numReceivedMsgs.begin(); it != _numReceivedMsgs.end(); ++it) {
_numReceivedMsgs[it->first] = 0;
}
}
bool SequentialActGradReducer::reduce() {
int tgtDeviceID = _parent->getDeviceID();
for (int didx = 0; didx < _deviceIDs.size(); ) {
int d = _deviceIDs[didx];
_numReceivedMsgs[d] += _messageQueues[d]->dequeue();
if (_numReceivedMsgs[d] == _numExpectedMsgs[d]) {
if (d != tgtDeviceID) {
NVMatrix::setDeviceID(tgtDeviceID);
_parent->getActsGrad().resize(_parent->getActsGrad(d));
map<int, NVMatrix*> mats;
mats[d] = &_parent->getActsGrad(d);
mats[tgtDeviceID] = &_parent->getActsGrad(tgtDeviceID);
_broadcaster->transfer(mats, d, didx > 0, 1);
}
didx++;
assert(_messageQueues[d]->getNumElements() == 0);
} else if (_numReceivedMsgs[d] >= _numExpectedMsgs[d]) { // exit
return true;
}
}
return false;
}
void SequentialActGradReducer::enqueueReduction(int deviceID) {
_messageQueues[deviceID]->enqueue(1);
}
void SequentialActGradReducer::stop() {
for(map<int,Queue<int>* >::const_iterator it = _messageQueues.begin(); it != _messageQueues.end(); ++it) {
it->second->enqueue(ACT_GRAD_REDUCER_EXIT);
}
join();
}
/* =====================
* ParallelActGradReducer
* =====================
*/
ParallelActGradReducer::ParallelActGradReducer(Layer& parent, map<int, int> numExpectedMsgs)
: IActGradReducer(parent, numExpectedMsgs), _numReceivedMsgs(0) {
_reducer = &(new EightGPUReducer1(parent.getDeviceID()))->construct();
_scaleTarget = numExpectedMsgs.count(parent.getDeviceID()) > 0 && numExpectedMsgs[parent.getDeviceID()] > 0;
}
bool ParallelActGradReducer::reduce() {
// TODO: make it so that you can start the reduction before you've received all the messages.
while(_numReceivedMsgs < _numExpectedMsgsTotal) {
_numReceivedMsgs += _messageQueue.dequeue();
}
if (_numReceivedMsgs > _numExpectedMsgsTotal) {
return true; // exit
}
map<int,NVMatrix*> mats = _parent->getAllActsGrads();
_reducer->reduce(mats, 1, _scaleTarget);
assert(_messageQueue.getNumElements() == 0);
return false;
}
void ParallelActGradReducer::enqueueReduction(int deviceID) {
_messageQueue.enqueue(1);
}
void ParallelActGradReducer::stop() {
_messageQueue.enqueue(ACT_GRAD_REDUCER_EXIT);
join();
}
void ParallelActGradReducer::reset() {
_numReceivedMsgs = 0;
}

View File

@ -0,0 +1,135 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "../include/jpeg.h"
using namespace std;
/* ========================
* DecoderThread
* ========================
*/
DecoderThread::DecoderThread(PyObject* pyList, Matrix& target, int start_img, int end_img, int img_size, int inner_size, bool test, bool multiview)
: Thread(true), _pyList(pyList), _target(&target), _start_img(start_img), _end_img(end_img),
_img_size(img_size), _inner_size(inner_size), _test(test), _multiview(multiview),
_decodeTarget(0), _decodeTargetSize(0) {
_inner_pixels = _inner_size * _inner_size;
_rseed = time(0);
}
DecoderThread::~DecoderThread(){
free(_decodeTarget);
}
void* DecoderThread::run() {
int numSrcCases = PyList_GET_SIZE(_pyList);
assert(_target->getNumCols() == _inner_pixels * 3);
assert(_target->getNumRows() == PyList_GET_SIZE(_pyList) * (_multiview ? 10 : 1));
int width, height;
for (int64 i = _start_img; i < _end_img; ++i) {
decodeJpeg(i, width, height);
assert((width == _img_size && height >= _img_size)
|| (height == _img_size && width >= _img_size));
if (_multiview) {
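// 10 views per image: 5 crops (4 corners + center) x 2 horizontal flips.
// Output row for view v of image i is numSrcCases * (flip * 5 + v) + i.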
for (int flip = 0; flip < 2; ++flip) {
crop(numSrcCases * (flip * 5 + 0) + i, width, height, flip, 0, 0); // top-left
crop(numSrcCases * (flip * 5 + 1) + i, width, height, flip, width - _inner_size, 0); // top-right
crop(numSrcCases * (flip * 5 + 2) + i, width, height, flip, (width - _inner_size) / 2, (height - _inner_size) / 2); // center
crop(numSrcCases * (flip * 5 + 3) + i, width, height, flip, 0, height - _inner_size); // bottom-left
crop(numSrcCases * (flip * 5 + 4) + i, width, height, flip, width - _inner_size, height - _inner_size); // bottom-right
}
} else {
crop(i, width, height, !_test && (rand_r(&_rseed) % 2));
}
}
return NULL;
}
void DecoderThread::decodeJpeg(int idx, int& width, int& height) {
PyObject* pySrc = PyList_GET_ITEM(_pyList, idx);
unsigned char* src = (unsigned char*)PyString_AsString(pySrc);
size_t src_len = PyString_GET_SIZE(pySrc);
struct jpeg_decompress_struct cinf;
struct jpeg_error_mgr jerr;
cinf.err = jpeg_std_error(&jerr);
jpeg_create_decompress(&cinf);
jpeg_mem_src(&cinf, src, src_len);
assert(jpeg_read_header(&cinf, TRUE));
cinf.out_color_space = JCS_RGB;
assert(jpeg_start_decompress(&cinf));
assert(cinf.num_components == 3 || cinf.num_components == 1);
width = cinf.image_width;
height = cinf.image_height;
if (_decodeTargetSize < width * height * 3) {
free(_decodeTarget);
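// Over-allocate 3x the required size so that subsequent, somewhat larger
// images do not force a reallocation.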
_decodeTargetSize = width * height * 3 * 3;
_decodeTarget = (unsigned char*)malloc(_decodeTargetSize);
}
while (cinf.output_scanline < cinf.output_height) {
JSAMPROW tmp = &_decodeTarget[width * cinf.out_color_components * cinf.output_scanline];
assert(jpeg_read_scanlines(&cinf, &tmp, 1) > 0);
}
assert(jpeg_finish_decompress(&cinf));
jpeg_destroy_decompress(&cinf);
}
/*
* Uniform in [0,1)
*/
inline double DecoderThread::randUniform() {
return double(rand_r(&_rseed)) / (int64(RAND_MAX) + 1);
}
/*
* Uniform in [min, max)
*/
inline double DecoderThread::randUniform(double min, double max) {
return (max - min) * randUniform() + min;
}
void DecoderThread::crop(int64 i, int64 src_width, int64 src_height, bool flip) {
crop(i, src_width, src_height, flip, -1, -1);
}
void DecoderThread::crop(int64 i, int64 src_width, int64 src_height, bool flip, int64 crop_start_x, int64 crop_start_y) {
const int64 border_size_y = src_height - _inner_size;
const int64 border_size_x = src_width - _inner_size;
if (crop_start_x < 0) {
crop_start_x = _test ? (border_size_x / 2) : (rand_r(&_rseed) % (border_size_x + 1));
}
if (crop_start_y < 0) {
crop_start_y = _test ? (border_size_y / 2) : (rand_r(&_rseed) % (border_size_y + 1));
}
const int64 src_pixels = src_width * src_height;
for (int64 c = 0; c < 3; ++c) {
for (int64 y = crop_start_y; y < crop_start_y + _inner_size; ++y) {
for (int64 x = crop_start_x; x < crop_start_x + _inner_size; ++x) {
assert((y >= 0 && y < src_height && x >= 0 && x < src_width));
_target->getCell(i, c * _inner_pixels + (y - crop_start_y) * _inner_size
+ (flip ? (_inner_size - 1 - x + crop_start_x)
: (x - crop_start_x)))
= _decodeTarget[3 * (y * src_width + x) + c];
}
}
}
}

File diff suppressed because it is too large

View File

@ -0,0 +1,555 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <assert.h>
#include <vector>
#include <cmath>
#include "../include/layer_kernels.cuh"
using namespace std;
/*
* E = -log(y_t)
* probs: (numOut, numCases)
* labels: (1, numCases)
 * maxProbs: (1, numCases)
* labelLogProbs: (1, numCases) (*out)
* correctProbs: (1, numCases) (*out)
* top5Probs: (1, numCases) (*out)
*
* target: (1, numCases)
*
*/
__global__ void kMultiSoftmaxCost(float* probs, float* labels, float* maxProbs,
float* labelLogProbs, float* correctProbs, float* top5Probs,
const int numCases, const int numOut, const int setSize) {
const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x;
if (tx < numCases) {
const int label = int(labels[tx]);
const float maxp = maxProbs[tx];
const float labelp = probs[label * numCases + tx];
labelLogProbs[tx] = __logf(labelp);
int numBiggerProbs = 0, numEqualsProbs = 0;
for (int i = 0; i < numOut; ++i) {
numBiggerProbs += probs[i * numCases + tx] > labelp;
numEqualsProbs += probs[i * numCases + tx] == labelp;
}
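// Expected top-setSize accuracy under random tie-breaking: after the
// strictly-larger probs take their slots, the label claims one of the
// remaining slots with probability min(1, slotsLeft / numEqualsProbs).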
const int slotsLeft = setSize - numBiggerProbs;
top5Probs[tx] = slotsLeft <= 0.0f ? 0.0f : (numEqualsProbs <= slotsLeft ? 1.0f : float(slotsLeft) / numEqualsProbs);
correctProbs[tx] = labelp != maxp ? 0.0f : 1.0f / float(numEqualsProbs);
}
}
/*
* E = -log(y_t)
* probs: (numOut, numCases)
* labels: (1, numCases)
* maxProbs: (1, numCases)
* labelLogProbs: (1, numCases) (*out)
* correctProbs: (1, numCases) (*out)
* top5Probs: (1, numCases) (*out)
*
 * target: (1, numCases) == log(y_l[labels,:])
*/
void computeMultiSoftmaxCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs, NVMatrix& labelLogProbs_out,
NVMatrix& correctProbs_out, NVMatrix& top5Probs_out, int setSize) {
int numCases = probs.getNumCols();
int numOut = probs.getNumRows();
assert(labels.getNumElements() == numCases);
assert(!labels.isTrans());
assert(!probs.isTrans());
assert(labels.isContiguous());
assert(probs.isContiguous());
// NVMatrix& maxProbs = probs.max(0);
labelLogProbs_out.resize(1, numCases);
correctProbs_out.resize(1, numCases);
top5Probs_out.resize(1, numCases);
dim3 threads(LOGREG_ERR_THREADS_X, 1);
dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1);
cudaStream_t stream = NVMatrix::getDefaultStream();
cudaFuncSetCacheConfig(kMultiSoftmaxCost, cudaFuncCachePreferL1);
kMultiSoftmaxCost<<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), maxProbs.getDevData(),
labelLogProbs_out.getDevData(), correctProbs_out.getDevData(), top5Probs_out.getDevData(),
numCases, numOut, setSize);
getLastCudaError("kMultiSoftmaxCost: Kernel execution failed");
// cudaThreadSynchronize();
}
/*
* E = sum(p_l * log(y_l))
* probs: (numOut, numCases)
* labels: (numOut, numCases)
* maxProbs: (1, numCases)
* labelLogProbs: (1, numCases) (*out)
* correctProbs: (1, numCases) (*out)
*
* target: (1, numCases)
*/
__global__ void kCrossEntCost(float* probs, float* labels, float* maxProbs, float* labelLogProbs, float* correctProbs,
const int numCases, const int numOut) {
const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x;
if (tx < numCases) {
probs += tx;
labels += tx;
maxProbs += tx;
labelLogProbs += tx;
correctProbs += tx;
const float maxp = maxProbs[0];
/*
* Compute the probability of guessing the correct case if you take the most-probable label.
*
* This is done like this:
*
* - If the most probable label is not equal to the true label, then the probability is zero.
* - Otherwise, the probability is 1 / (number of labels whose probability is equal to the maximum).
*
* This is certainly overkill -- in practice, it's just about impossible for two labels to get assigned
* maximum probability. But it's a safety measure to prevent over-estimating your accuracy.
* Though it could never happen in reality. Well it could. But it wouldn't. Cool?
*/
float crossEnt = 0.0f;
int numMax = 0;
bool correctLabel = false;
for (int i = 0; i < numOut; i++) {
const float label_prob = labels[i * numCases];
const float model_prob = probs[i * numCases];
numMax += model_prob == maxp;
crossEnt += label_prob * safelog(model_prob);
correctLabel |= model_prob == maxp && label_prob > 0.0f;
}
labelLogProbs[0] = crossEnt;
if (!correctLabel) {
correctProbs[0] = 0.0f;
} else {
correctProbs[0] = 1.0f / float(numMax);
}
}
}
/*
* E = sum(p_l * log(y_l))
* y_l: (numOut, numCases)
* labels: (numOut, numCases)
*
* dE_dy_l: (numOut, numCases)
*/
template <bool add>
__global__ void kCrossEntGrad(float* y_l, float* labels, float* dE_dy_l, const int numCases,
const int numOut, const float gradCoeff) {
const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
const int tidx = ty * numCases + tx;
if (ty < numOut && tx < numCases) {
const float label_prob = labels[tidx];
const float model_prob = y_l[tidx];
const float v = gradCoeff * __fdividef(label_prob, model_prob);
if (add) {
dE_dy_l[tidx] += v;
} else {
dE_dy_l[tidx] = v;
}
}
}
/*
* E = sum(p_l * log(y_l))
* y_l: (numOut, numCases)
* labels: (numOut, numCases)
*
* dE_dx_l: (numOut, numCases)
*/
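// Fused gradient: for E = sum(p_l * log(y_l)) with y = softmax(x) and labels
// p summing to 1, dE/dx_l simplifies to (p_l - y_l), scaled by gradCoeff.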
template <bool add>
__global__ void kCrossEntSoftmaxGrad(float* y_l, float* labels, float* dE_dx_l, const int numCases,
const int numOut, const float gradCoeff) {
const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
const int tidx = ty * numCases + tx;
if (ty < numOut && tx < numCases) {
const float model_prob = y_l[tidx];
const float label_prob = labels[tidx];
float v = gradCoeff * (label_prob - model_prob);
if (add) {
dE_dx_l[tidx] += v;
} else {
dE_dx_l[tidx] = v;
}
}
}
/*
* E = -log(y_t)
* probs: (numOut, numCases)
* labels: (1, numCases)
* maxProbs: (1, numCases)
* labelLogProbs: (1, numCases) (*out)
* correctProbs: (1, numCases) (*out)
*
* target: (1, numCases)
*/
__global__ void kLogregCost(float* probs, float* labels, float* maxProbs, float* labelLogProbs, float* correctProbs,
const int numCases, const int numOut) {
const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x;
if (tx < numCases) {
const int label = int(labels[tx]);
const float maxp = maxProbs[tx];
const float labelp = probs[label * numCases + tx];
labelLogProbs[tx] = __logf(labelp);
/*
* Compute the probability of guessing the correct case if you take the most-probable label.
*
* This is done like this:
*
* - If the most probable label is not equal to the true label, then the probability is zero.
* - Otherwise, the probability is 1 / (number of labels whose probability is equal to the maximum).
*
* This is certainly overkill -- in practice, it's just about impossible for two labels to get assigned
* maximum probability. But it's a safety measure to prevent over-estimating your accuracy.
* Though it could never happen in reality. Well it could. But it wouldn't. Cool?
*/
if (labelp != maxp) {
correctProbs[tx] = 0;
} else {
int numMax = 0;
for (int i = 0; i < numOut; i++) {
numMax += probs[i * numCases + tx] == maxp;
}
correctProbs[tx] = 1.0f / float(numMax);
}
}
}
/*
* E = -log(y_t)
* y_l: (numOut, numCases)
* labels: (1, numCases)
*
* dE_dy_l: (numOut, numCases)
*/
template <bool add>
__global__ void kLogregCostGrad(float* y_l, float* labels, float* dE_dy_l, const int numCases,
const int numOut, const float gradCoeff) {
const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
const int tidx = ty * numCases + tx;
if (ty < numOut && tx < numCases) {
const int label = int(labels[tx]);
float v = gradCoeff * (label == ty);
v = __fdividef(v, y_l[tidx]);
if (add) {
dE_dy_l[tidx] += v;
} else {
dE_dy_l[tidx] = v;
}
}
}
/*
* E = -log(y_t)
* y_l: (numOut, numCases)
* labels: (1, numCases)
*
* dE_dx_l: (numOut, numCases)
*/
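// Fused logreg + softmax gradient: dE/dx_l = gradCoeff * (1{l == label} - y_l),
// which avoids the division by y_l needed in kLogregCostGrad above.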
template <bool add>
__global__ void kLogregSoftmaxGrad(float* y_l, float* labels, float* dE_dx_l, const int numCases,
const int numOut, const float gradCoeff) {
const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
const int tidx = ty * numCases + tx;
if (ty < numOut && tx < numCases) {
const int label = int(labels[tx]);
float v = gradCoeff * ((label == ty) - y_l[tidx]);
if (add) {
dE_dx_l[tidx] += v;
} else {
dE_dx_l[tidx] = v;
}
}
}
/*
* dE_dy_l: (numOut, numCases)
* y_l: (numOut, numCases)
*
* dE_dx_l: (numOut, numCases)
*/
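// Softmax backward pass via the full Jacobian:
//   dE/dx_i = y_i * sum_j dE/dy_j * ((i == j) - y_j)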
template <bool add>
__global__ void kSoftmaxGrad(float* dE_dy_l, float* y_l, float* dE_dx_l, const int numCases, const int numOut, const float scaleTarget, const float scaleGrad) {
const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
const int tidx = ty * numCases + tx;
if (ty < numOut && tx < numCases) {
float v = 0;
for (int j = 0; j < numOut; j++) {
v += dE_dy_l[j * numCases + tx] * ((j == ty) - y_l[j * numCases + tx]);
}
v *= y_l[tidx];
if (add) {
dE_dx_l[tidx] = scaleTarget * dE_dx_l[tidx] + scaleGrad * v;
} else {
dE_dx_l[tidx] = scaleGrad * v;
}
}
}
template <int B_X, bool add>
__global__ void kEltwiseMaxGrad(float* actGrad, float* input, float* output, float* target,
const int numElements) {
for (int i = B_X * blockIdx.x + threadIdx.x; i < numElements; i += B_X * gridDim.x) {
if (add) {
target[i] += actGrad[i] * (output[i] == input[i]);
} else {
target[i] = actGrad[i] * (output[i] == input[i]);
}
}
}
void computeEltwiseMaxGrad(NVMatrix& actGrad, NVMatrix& input, NVMatrix& output, NVMatrix& target, bool add) {
assert(actGrad.isContiguous());
assert(output.isContiguous());
assert(input.isContiguous());
assert(actGrad.isSameDims(input));
assert(actGrad.isSameDims(output));
dim3 blocks(DIVUP(actGrad.getNumElements(), 128));
dim3 threads(128);
cudaStream_t stream = NVMatrix::getDefaultStream();
if (add) {
assert(actGrad.isSameDims(target));
cudaFuncSetCacheConfig(kEltwiseMaxGrad<128, true>, cudaFuncCachePreferL1);
kEltwiseMaxGrad<128, true><<<blocks, threads, 0, stream>>>(actGrad.getDevData(), input.getDevData(), output.getDevData(), target.getDevData(), actGrad.getNumElements());
} else {
target.resize(actGrad);
cudaFuncSetCacheConfig(kEltwiseMaxGrad<128, false>, cudaFuncCachePreferL1);
kEltwiseMaxGrad<128, false><<<blocks, threads, 0, stream>>>(actGrad.getDevData(), input.getDevData(), output.getDevData(), target.getDevData(), actGrad.getNumElements());
}
getLastCudaError("computeEltwiseMaxGrad: Kernel execution failed");
}
/*
* E = sum_i{-p_i*log(y_i)}
* probs: (numOut, numCases)
* labels: (numOut, numCases)
* maxProbs: (1, numCases)
* labelLogProbs: (1, numCases) (*out)
* correctProbs: (1, numCases) (*out)
*
* target: (1, numCases)
*/
void computeCrossEntCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out) {
int numCases = probs.getNumCols();
int numOut = probs.getNumRows();
assert(labels.isSameDims(probs));
assert(!labels.isTrans());
assert(!probs.isTrans());
assert(labels.isContiguous());
assert(probs.isContiguous());
NVMatrix& maxProbs = probs.max(0);
labelLogProbs_out.resize(1, numCases);
correctProbs_out.resize(1, numCases);
dim3 threads(LOGREG_ERR_THREADS_X, 1);
dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1);
cudaStream_t stream = NVMatrix::getDefaultStream();
cudaFuncSetCacheConfig(kCrossEntCost, cudaFuncCachePreferL1);
kCrossEntCost<<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), maxProbs.getDevData(),
labelLogProbs_out.getDevData(), correctProbs_out.getDevData(),
numCases, numOut);
getLastCudaError("kCrossEntCost: Kernel execution failed");
delete &maxProbs;
}
void computeCrossEntGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
int numCases = probs.getLeadingDim();
int numOut = probs.getFollowingDim();
assert(labels.isSameDims(probs));
assert(probs.isContiguous());
assert(target.isContiguous());
assert(labels.isContiguous());
assert(!labels.isTrans());
assert(!probs.isTrans());
dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
cudaStream_t stream = NVMatrix::getDefaultStream();
if (!add) {
target.resize(probs);
kCrossEntGrad<false><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
numCases, numOut, coeff);
} else {
kCrossEntGrad<true><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
numCases, numOut, coeff);
}
getLastCudaError("kCrossEntGrad: Kernel execution failed");
}
void computeSoftmaxGrad(NVMatrix& acts, NVMatrix& actsGrad, NVMatrix& target, float scaleTarget, float scaleGrad) {
int numCases = acts.getLeadingDim();
int numOut = acts.getFollowingDim();
assert(acts.isSameDims(actsGrad));
assert(acts.isContiguous());
assert(actsGrad.isContiguous());
assert(target.isContiguous());
assert(acts.isTrans());
assert(actsGrad.isTrans());
dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
cudaStream_t stream = NVMatrix::getDefaultStream();
if (scaleTarget == 0) {
target.resize(acts);
kSoftmaxGrad<false><<<blocks, threads, 0, stream>>>(actsGrad.getDevData(), acts.getDevData(), target.getDevData(), numCases, numOut, scaleTarget, scaleGrad);
} else {
kSoftmaxGrad<true><<<blocks, threads, 0, stream>>>(actsGrad.getDevData(), acts.getDevData(), target.getDevData(), numCases, numOut, scaleTarget, scaleGrad);
}
getLastCudaError("computeSoftmaxGrad: Kernel execution failed");
}
void computeCrossEntSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
int numCases = probs.getLeadingDim();
int numOut = probs.getFollowingDim();
assert(labels.getLeadingDim() == probs.getLeadingDim() && labels.getFollowingDim() == probs.getFollowingDim());
assert(probs.isContiguous());
assert(target.isContiguous());
assert(labels.isContiguous());
assert(probs.isTrans());
assert(!labels.isTrans());
dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
cudaStream_t stream = NVMatrix::getDefaultStream();
if (!add) {
target.resize(probs);
cudaFuncSetCacheConfig(kCrossEntSoftmaxGrad<false>, cudaFuncCachePreferL1);
kCrossEntSoftmaxGrad<false><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
numCases, numOut, coeff);
} else {
cudaFuncSetCacheConfig(kCrossEntSoftmaxGrad<true>, cudaFuncCachePreferL1);
kCrossEntSoftmaxGrad<true><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
numCases, numOut, coeff);
}
getLastCudaError("kCrossEntSoftmaxGrad: Kernel execution failed");
}
/*
* E = -log(y_t)
* probs: (numOut, numCases)
* labels: (1, numCases)
* maxProbs: (1, numCases)
* labelLogProbs: (1, numCases) (*out)
* correctProbs: (1, numCases) (*out)
*
 * target: (1, numCases) == log(y_l[labels,:])
*/
void computeLogregCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out) {
int numCases = probs.getNumCols();
int numOut = probs.getNumRows();
assert(labels.getNumElements() == numCases);
assert(!labels.isTrans());
assert(!probs.isTrans());
assert(labels.isContiguous());
assert(probs.isContiguous());
labelLogProbs_out.resize(1, numCases);
correctProbs_out.resize(1, numCases);
dim3 threads(LOGREG_ERR_THREADS_X, 1);
dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1);
cudaStream_t stream = NVMatrix::getDefaultStream();
cudaFuncSetCacheConfig(kLogregCost, cudaFuncCachePreferL1);
kLogregCost<<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), maxProbs.getDevData(),
labelLogProbs_out.getDevData(), correctProbs_out.getDevData(),
numCases, numOut);
getLastCudaError("computeLogregCost: Kernel execution failed");
}
void computeLogregGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
int numCases = probs.getLeadingDim();
int numOut = probs.getFollowingDim();
assert(labels.getNumElements() == numCases);
assert(probs.isContiguous());
assert(target.isContiguous());
assert(labels.isContiguous());
assert(!labels.isTrans());
assert(!probs.isTrans());
dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
cudaStream_t stream = NVMatrix::getDefaultStream();
if (!add) {
target.resize(probs);
kLogregCostGrad<false><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
numCases, numOut, coeff);
} else {
kLogregCostGrad<true><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
numCases, numOut, coeff);
}
getLastCudaError("computeLogregGrad: Kernel execution failed");
}
void computeLogregSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
int numCases = probs.getLeadingDim();
int numOut = probs.getFollowingDim();
assert(labels.getNumElements() == numCases);
assert(probs.isContiguous());
assert(target.isContiguous());
assert(labels.isContiguous());
assert(probs.isTrans());
dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
cudaStream_t stream = NVMatrix::getDefaultStream();
if (!add) {
target.resize(probs);
kLogregSoftmaxGrad<false><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
numCases, numOut, coeff);
} else {
kLogregSoftmaxGrad<true><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
numCases, numOut, coeff);
}
getLastCudaError("computeLogregSoftmaxGrad: Kernel execution failed");
}
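/*
 * [Illustrative sketch, not part of the original file.] Hard-label analogue
 * of crossEntSoftmaxGradRef above: fusing logreg with softmax, the gradient
 * w.r.t. the softmax input for output j of case c is coeff * ((j == t) - y_j)
 * with t = labels[c], under the same sign convention and layout assumptions.
 */
static void logregSoftmaxGradRef(const float* labels, const float* probs,
                                 float* target, int numCases, int numOut,
                                 bool add, float coeff) {
    for (int j = 0; j < numOut; j++) {
        for (int c = 0; c < numCases; c++) {
            int i = j * numCases + c;
            float ind = j == (int)labels[c] ? 1.0f : 0.0f;
            float v = coeff * (ind - probs[i]);
            target[i] = add ? target[i] + v : v;
        }
    }
}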

View File

@ -0,0 +1,114 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cmath>
#include <string>
#include "../include/lr.cuh"
#include "../include/util.cuh"
/*
* ==================================
* ParameterSchedule
* ==================================
*/
ParameterSchedule& ParameterSchedule::make(PyObject* schedDict) {
std::string type = pyDictGetString(schedDict, "type");
PyObject* paramsDict = PyDict_GetItemString(schedDict, "params");
double base = pyDictGetFloat(paramsDict, "base");
if (type == "const") {
return *new ParameterSchedule(base);
} else {
double tgtFactor = pyDictGetFloat(paramsDict, "tgtFactor");
if (type == "linear") {
return *new LinearParameterSchedule(base, tgtFactor);
} else if (type == "exp") {
return *new ExpParameterSchedule(base, tgtFactor);
} else if (type == "dexp") {
int numSteps = pyDictGetInt(paramsDict, "numSteps");
return *new DiscreteExpParameterSchedule(base, tgtFactor, numSteps);
}
}
throw std::string("Unknown learning rate schedule type: ") + type;
}
ParameterSchedule::ParameterSchedule(double baseRate)
: _baseRate(baseRate) {
}
double ParameterSchedule::getValue(double progress) {
return _baseRate;
}
double ParameterSchedule::getBaseValue() const {
return _baseRate;
}
ParameterSchedule::~ParameterSchedule() {
}
/*
* ==================================
* LinearParameterSchedule
* ==================================
*/
LinearParameterSchedule::LinearParameterSchedule(double baseRate, double tgtFactor)
: ParameterSchedule(baseRate) {
_finalRate = baseRate / tgtFactor;
}
double LinearParameterSchedule::getValue(double progress) {
return _baseRate * (1 - progress) + _finalRate * progress;
}
/*
* ==================================
* ExpParameterSchedule
* ==================================
*/
ExpParameterSchedule::ExpParameterSchedule(double baseRate, double tgtFactor)
: ParameterSchedule(baseRate) {
_powBase = 1.0 / tgtFactor;
}
double ExpParameterSchedule::getValue(double progress) {
return _baseRate * std::pow(_powBase, progress);
}
/*
* ==================================
* DiscreteExpParameterSchedule
* ==================================
*/
DiscreteExpParameterSchedule::DiscreteExpParameterSchedule(double baseRate, double tgtFactor, int numSteps)
: ParameterSchedule(baseRate) {
ExpParameterSchedule elrs(baseRate, tgtFactor);
double finalRate = baseRate / tgtFactor;
for (int i = 0; i < numSteps - 1; i++) {
double progress = double(i) / (numSteps - 1);
_rates.push_back(elrs.getValue(progress));
}
_rates.push_back(finalRate);
//printf("initialized base %e, final %e, steps %d\n", baseRate, finalRate, numSteps);
}
double DiscreteExpParameterSchedule::getValue(double progress) {
for (size_t i = 0; i < _rates.size(); ++i) {
if (progress <= double(i + 1) / _rates.size()) {
return _rates[i];
}
}
return _rates.back();
}
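/*
 * [Illustrative sketch, not part of the original file.] How these schedules
 * behave over training progress in [0, 1], assuming base = 0.01 and
 * tgtFactor = 100 (final rate = base / tgtFactor = 1e-4):
 *   - LinearParameterSchedule interpolates 0.01 -> 1e-4 linearly;
 *   - ExpParameterSchedule decays geometrically: 0.01 * (1/100)^progress;
 *   - DiscreteExpParameterSchedule samples that curve at numSteps points.
 * Assumes printf is reachable through the includes above.
 */
static void scheduleDemo() {
    LinearParameterSchedule linSched(0.01, 100);
    ExpParameterSchedule expSched(0.01, 100);
    DiscreteExpParameterSchedule dexpSched(0.01, 100, 4);
    for (int s = 0; s <= 4; s++) {
        double p = s / 4.0;
        printf("p=%.2f linear=%e exp=%e dexp=%e\n", p,
               linSched.getValue(p), expSched.getValue(p), dexpSched.getValue(p));
    }
}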

View File

@ -0,0 +1,139 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "../include/memorysource.cuh"
using namespace std;
/*
* =======================
* MemoryView
* =======================
*/
MemoryView::MemoryView(MemorySource& src, std::string& name) : _src(&src), _name(name) {
}
MemoryView::~MemoryView() {
// if (_src->truncate(_name)) {
// delete _src;
// }
}
NVMatrix& MemoryView::getMemory(int numCases) {
return _src->getMemory(_name, numCases);
}
NVMatrix& MemoryView::getMemory() {
return _src->getMemory(_name);
}
MemorySource& MemoryView::getMemorySource() {
return *_src;
}
bool MemoryView::isParent() {
return _src->getRange(_name).first == 0 && _src->getRange(_name).second == _src->getSize();
}
std::string& MemoryView::getName() {
return _name;
}
MemoryView& MemoryView::clone(std::string& name) {
return _src->addUser(name, _src->getRange(_name));
}
/*
* =======================
* MemorySource
* =======================
*/
MemorySource::MemorySource(int size, int deviceID) : _size(size), _deviceID(deviceID) {
}
MemorySource::~MemorySource() {
// Each MemoryView is deleted by owner Layer, and the last one deletes the MemorySource.
// So this is a no-op.
}
NVMatrix& MemorySource::getMemory(std::string& name) {
return getMemory(name, _memory.getLeadingDim());
}
// Deletes old view when appropriate
NVMatrix& MemorySource::getMemory(std::string& name, int numCases) {
numCases = numCases < 0 ? _memory.getLeadingDim() : numCases;
_lock.acquire();
if (_memory.getLeadingDim() != numCases || _memory.getFollowingDim() != _size) {
int d = NVMatrix::getDeviceID();
NVMatrix::setDeviceID(_deviceID);
_memory.resize(_size, numCases, false);
for (map<std::string,NVMatrix*>::const_iterator it = _memoryViews.begin(); it != _memoryViews.end(); ++it) {
delete it->second;
}
_memoryViews.clear();
if (d >= 0) {
NVMatrix::setDeviceID(d);
}
}
if (_memoryViews.count(name) == 0) {
assert(!_memory.isTrans());
_memoryViews[name] = &_memory.sliceRows(_viewRanges[name].first, _viewRanges[name].second);
}
NVMatrix& view = *_memoryViews[name];
assert(view.isContiguous());
_lock.release();
return view;
}
MemoryView& MemorySource::addUser(std::string& name, std::pair<int,int> range) {
assert(_viewRanges.count(name) == 0);
_viewRanges[name] = range;
return *new MemoryView(*this, name);
}
MemoryView& MemorySource::addUser(std::string& name) {
return addUser(name, std::pair<int,int>(0, _size));
}
MemoryView& MemorySource::make(int size, int deviceID, std::string& parentUser) {
return (new MemorySource(size, deviceID))->addUser(parentUser);
}
pair<int,int> MemorySource::getRange(std::string& name) {
return _viewRanges[name];
}
int MemorySource::getSize() {
return _size;
}
bool MemorySource::truncate(std::string& name) {
bool truncated = false;
_lock.acquire();
_truncateRequests.insert(name);
if (_truncateRequests.size() == _viewRanges.size()) {
for (map<std::string,NVMatrix*>::const_iterator it = _memoryViews.begin(); it != _memoryViews.end(); ++it) {
delete it->second;
}
_memoryViews.clear();
_memory.truncate();
_truncateRequests.clear();
truncated = true;
}
_lock.release();
return truncated;
}
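/*
 * [Illustrative sketch, not part of the original file.] Intended usage, as
 * the code above suggests: one MemorySource backs several named views that
 * alias (slices of) a single device buffer, and getMemory() lazily resizes
 * the backing matrix to the current minibatch size, invalidating stale views.
 * The names and sizes here are made up for illustration.
 */
static void memorySourceDemo(int numOut, int deviceID) {
    std::string actsName("acts"), aliasName("acts_alias");
    MemoryView& acts = MemorySource::make(numOut, deviceID, actsName);
    // A second named user over the same row range aliases the parent buffer.
    MemoryView& alias = acts.clone(aliasName);
    NVMatrix& a = acts.getMemory(128);   // backing storage becomes (numOut, 128)
    NVMatrix& b = alias.getMemory(128);  // contiguous view over the same rows
    (void)a; (void)b;
}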

View File

@ -0,0 +1,75 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "../include/neuron.cuh"
#include "../include/util.cuh"
using namespace std;
Neuron& Neuron::makeNeuron(PyObject* neuronDict) {
std::string type = pyDictGetString(neuronDict, "type");
PyObject* neuronParamsDict = PyDict_GetItemString(neuronDict, "params");
if (type == "relu") {
return *new ReluNeuron();
}
if (type == "drelu") {
return *new DoubleReluNeuron(pyDictGetFloat(neuronParamsDict, "a"));
}
if (type == "softrelu") {
return *new SoftReluNeuron();
}
if (type == "brelu") {
return *new BoundedReluNeuron(pyDictGetFloat(neuronParamsDict, "a"));
}
if (type == "abs") {
return *new AbsNeuron();
}
if (type == "logistic") {
return *new LogisticNeuron();
}
if (type == "tanh") {
return *new TanhNeuron(pyDictGetFloat(neuronParamsDict, "a"), pyDictGetFloat(neuronParamsDict, "b"));
}
if (type == "square") {
return *new SquareNeuron();
}
if (type == "sqrt") {
return *new SqrtNeuron();
}
if (type == "linear") {
return *new LinearNeuron(pyDictGetFloat(neuronParamsDict, "a"), pyDictGetFloat(neuronParamsDict, "b"));
}
if (type == "log") {
return *new LogNeuron(pyDictGetFloat(neuronParamsDict, "a"));
}
if (type == "ident") {
return *new Neuron();
}
throw std::string("Unknown neuron type: ") + type;
}
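/*
 * [Illustrative note, not part of the original file.] The dictionary passed
 * to makeNeuron is presumed to look like {"type": "tanh", "params": {"a": ...,
 * "b": ...}} on the Python side. For orientation, the activations these type
 * names conventionally denote in cuda-convnet (the authoritative definitions
 * live in neuron.cuh, so treat these as assumptions):
 *   relu      f(x) = max(0, x)
 *   drelu     f(x) = x > 0 ? x : a*x        (leaky/"double" relu)
 *   softrelu  f(x) = log(1 + e^x)
 *   brelu     f(x) = min(a, max(0, x))
 *   abs       f(x) = |x|
 *   logistic  f(x) = 1 / (1 + e^-x)
 *   tanh      f(x) = a * tanh(b * x)
 *   square    f(x) = x^2
 *   sqrt      f(x) = sqrt(x)
 *   linear    f(x) = a*x + b
 *   log       f(x) = log(a + x)
 *   ident     f(x) = x
 */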

Some files were not shown because too many files have changed in this diff.