mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 12:54:11 +08:00
Merge caffe2 with pytorch.
This commit is contained in:
0
.github/CONTRIBUTING.md
vendored
Normal file
0
.github/CONTRIBUTING.md
vendored
Normal file
5
.github/ISSUE_TEMPLATE.md
vendored
5
.github/ISSUE_TEMPLATE.md
vendored
@ -6,6 +6,7 @@ We like to limit our issues to bug reports and feature requests. If you have a q
|
||||
If you are submitting a feature request, please preface the title with [feature request].
|
||||
|
||||
When submitting a bug report, please include the following information (where relevant):
|
||||
- PyTorch or Caffe2:
|
||||
- OS:
|
||||
- PyTorch version:
|
||||
- How you installed PyTorch (conda, pip, source):
|
||||
@ -13,9 +14,11 @@ When submitting a bug report, please include the following information (where re
|
||||
- CUDA/cuDNN version:
|
||||
- GPU models and configuration:
|
||||
- GCC version (if compiling from source):
|
||||
- CMake version:
|
||||
- Build command you used (if compiling from source):
|
||||
- Versions of any other relevant libraries:
|
||||
|
||||
In addition, including the following information will also be very helpful for us to diagnose the problem:
|
||||
- A script to reproduce the bug. Please try to provide as minimal of a test case as possible.
|
||||
- Error messages and/or stack traces of the bug
|
||||
- Context around what you are trying to do
|
||||
|
||||
|
0
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
0
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
111
.gitignore
vendored
111
.gitignore
vendored
@ -1,3 +1,5 @@
|
||||
## PyTorch
|
||||
|
||||
build/
|
||||
dist/
|
||||
torch.egg-info/
|
||||
@ -60,3 +62,112 @@ test/data/linear.pt
|
||||
compile_commands.json
|
||||
*.egg-info/
|
||||
docs/source/_static/img/activation/
|
||||
|
||||
## General
|
||||
|
||||
# Compiled Object files
|
||||
*.slo
|
||||
*.lo
|
||||
*.o
|
||||
*.cuo
|
||||
*.obj
|
||||
|
||||
# Compiled Dynamic libraries
|
||||
*.so
|
||||
*.dylib
|
||||
*.dll
|
||||
|
||||
# Compiled Static libraries
|
||||
*.lai
|
||||
*.la
|
||||
*.a
|
||||
*.lib
|
||||
|
||||
# Compiled protocol buffers
|
||||
*.pb.h
|
||||
*.pb.cc
|
||||
*_pb2.py
|
||||
|
||||
# Compiled python
|
||||
*.pyc
|
||||
*.pyd
|
||||
|
||||
# Compiled MATLAB
|
||||
*.mex*
|
||||
|
||||
# IPython notebook checkpoints
|
||||
.ipynb_checkpoints
|
||||
|
||||
# Editor temporaries
|
||||
*.swn
|
||||
*.swo
|
||||
*.swp
|
||||
*~
|
||||
|
||||
# Sublime Text settings
|
||||
*.sublime-workspace
|
||||
*.sublime-project
|
||||
|
||||
# Eclipse Project settings
|
||||
*.*project
|
||||
.settings
|
||||
|
||||
# QtCreator files
|
||||
*.user
|
||||
|
||||
# PyCharm files
|
||||
.idea
|
||||
|
||||
# Visual Studio Code files
|
||||
.vscode
|
||||
.vs
|
||||
|
||||
# OSX dir files
|
||||
.DS_Store
|
||||
|
||||
## Caffe2
|
||||
|
||||
# build, distribute, and bins (+ python proto bindings)
|
||||
build
|
||||
build_host_protoc
|
||||
build_android
|
||||
build_ios
|
||||
build_*
|
||||
.build_debug/*
|
||||
.build_release/*
|
||||
distribute/*
|
||||
*.testbin
|
||||
*.bin
|
||||
cmake_build
|
||||
.cmake_build
|
||||
gen
|
||||
.setuptools-cmake-build
|
||||
.pytest_cache
|
||||
|
||||
# Bram
|
||||
plsdontbreak
|
||||
|
||||
# Generated documentation
|
||||
docs/_site
|
||||
docs/gathered
|
||||
_site
|
||||
doxygen
|
||||
docs/dev
|
||||
|
||||
# LevelDB files
|
||||
*.sst
|
||||
*.ldb
|
||||
LOCK
|
||||
LOG*
|
||||
CURRENT
|
||||
MANIFEST-*
|
||||
|
||||
# generated version file
|
||||
caffe2/version.py
|
||||
|
||||
# setup.py intermediates
|
||||
.eggs
|
||||
caffe2.egg-info
|
||||
|
||||
# Atom/Watchman required file
|
||||
.watchmanconfig
|
||||
|
81
.gitmodules
vendored
81
.gitmodules
vendored
@ -1,12 +1,3 @@
|
||||
[submodule "torch/lib/gloo"]
|
||||
path = third_party/gloo
|
||||
url = https://github.com/facebookincubator/gloo
|
||||
[submodule "torch/lib/pybind11"]
|
||||
path = third_party/pybind11
|
||||
url = https://github.com/pybind/pybind11
|
||||
[submodule "torch/lib/nanopb"]
|
||||
path = third_party/nanopb
|
||||
url = https://github.com/nanopb/nanopb.git
|
||||
[submodule "aten/src/ATen/cpu/cpuinfo"]
|
||||
path = aten/src/ATen/cpu/cpuinfo
|
||||
url = https://github.com/Maratyszcza/cpuinfo
|
||||
@ -17,3 +8,75 @@
|
||||
[submodule "aten/src/ATen/utils/catch"]
|
||||
path = aten/src/ATen/utils/catch
|
||||
url = https://github.com/catchorg/Catch2.git
|
||||
[submodule "third_party/nanopb"]
|
||||
path = third_party/nanopb
|
||||
url = https://github.com/nanopb/nanopb.git
|
||||
[submodule "third_party/pybind11"]
|
||||
path = third_party/pybind11
|
||||
url = https://github.com/pybind/pybind11.git
|
||||
[submodule "third_party/nccl"]
|
||||
path = third_party/nccl
|
||||
url = https://github.com/nvidia/nccl.git
|
||||
[submodule "third_party/cub"]
|
||||
path = third_party/cub
|
||||
url = https://github.com/NVlabs/cub.git
|
||||
[submodule "third_party/eigen"]
|
||||
path = third_party/eigen
|
||||
url = https://github.com/RLovelett/eigen.git
|
||||
[submodule "third_party/googletest"]
|
||||
path = third_party/googletest
|
||||
url = https://github.com/google/googletest.git
|
||||
[submodule "third_party/nervanagpu"]
|
||||
path = third_party/nervanagpu
|
||||
url = https://github.com/NervanaSystems/nervanagpu.git
|
||||
[submodule "third_party/benchmark"]
|
||||
path = third_party/benchmark
|
||||
url = https://github.com/google/benchmark.git
|
||||
[submodule "third_party/protobuf"]
|
||||
path = third_party/protobuf
|
||||
url = https://github.com/google/protobuf.git
|
||||
[submodule "third_party/ios-cmake"]
|
||||
path = third_party/ios-cmake
|
||||
url = https://github.com/Yangqing/ios-cmake.git
|
||||
[submodule "third_party/NNPACK"]
|
||||
path = third_party/NNPACK
|
||||
url = https://github.com/Maratyszcza/NNPACK.git
|
||||
[submodule "third_party/gloo"]
|
||||
path = third_party/gloo
|
||||
url = https://github.com/facebookincubator/gloo
|
||||
[submodule "third_party/NNPACK_deps/pthreadpool"]
|
||||
path = third_party/pthreadpool
|
||||
url = https://github.com/Maratyszcza/pthreadpool.git
|
||||
[submodule "third_party/NNPACK_deps/FXdiv"]
|
||||
path = third_party/FXdiv
|
||||
url = https://github.com/Maratyszcza/FXdiv.git
|
||||
[submodule "third_party/NNPACK_deps/FP16"]
|
||||
path = third_party/FP16
|
||||
url = https://github.com/Maratyszcza/FP16.git
|
||||
[submodule "third_party/NNPACK_deps/psimd"]
|
||||
path = third_party/psimd
|
||||
url = https://github.com/Maratyszcza/psimd.git
|
||||
[submodule "third_party/aten"]
|
||||
path = third_party/aten
|
||||
url = https://github.com/zdevito/aten
|
||||
[submodule "third_party/zstd"]
|
||||
path = third_party/zstd
|
||||
url = https://github.com/facebook/zstd.git
|
||||
[submodule "third-party/cpuinfo"]
|
||||
path = third_party/cpuinfo
|
||||
url = https://github.com/Maratyszcza/cpuinfo.git
|
||||
[submodule "third_party/python-enum"]
|
||||
path = third_party/python-enum
|
||||
url = https://github.com/PeachPy/enum34.git
|
||||
[submodule "third_party/python-peachpy"]
|
||||
path = third_party/python-peachpy
|
||||
url = https://github.com/Maratyszcza/PeachPy.git
|
||||
[submodule "third_party/python-six"]
|
||||
path = third_party/python-six
|
||||
url = https://github.com/benjaminp/six.git
|
||||
[submodule "third_party/ComputeLibrary"]
|
||||
path = third_party/ComputeLibrary
|
||||
url = https://github.com/ARM-software/ComputeLibrary.git
|
||||
[submodule "third_party/onnx"]
|
||||
path = third_party/onnx
|
||||
url = https://github.com/onnx/onnx.git
|
||||
|
14
.jenkins/caffe2/README.md
Normal file
14
.jenkins/caffe2/README.md
Normal file
@ -0,0 +1,14 @@
|
||||
# Jenkins
|
||||
|
||||
The scripts in this directory are the entrypoint for testing Caffe2.
|
||||
|
||||
The environment variable `BUILD_ENVIRONMENT` is expected to be set to
|
||||
the build environment you intend to test. It is a hint for the build
|
||||
and test scripts to configure Caffe2 a certain way and include/exclude
|
||||
tests. Docker images, they equal the name of the image itself. For
|
||||
example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker images that are
|
||||
built on Jenkins and are used in triggered builds already have this
|
||||
environment variable set in their manifest. Also see
|
||||
`./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`.
|
||||
|
||||
Our Jenkins installation is located at https://ci.pytorch.org/jenkins/.
|
185
.jenkins/caffe2/build.sh
Executable file
185
.jenkins/caffe2/build.sh
Executable file
@ -0,0 +1,185 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -ex
|
||||
|
||||
LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
ROOT_DIR=$(cd "$LOCAL_DIR"/../.. && pwd)
|
||||
|
||||
# Setup sccache if SCCACHE_BUCKET is set
|
||||
if [ -n "${SCCACHE_BUCKET}" ]; then
|
||||
mkdir -p ./sccache
|
||||
|
||||
SCCACHE="$(which sccache)"
|
||||
if [ -z "${SCCACHE}" ]; then
|
||||
echo "Unable to find sccache..."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Setup wrapper scripts
|
||||
for compiler in cc c++ gcc g++ x86_64-linux-gnu-gcc; do
|
||||
(
|
||||
echo "#!/bin/sh"
|
||||
echo "exec $SCCACHE $(which $compiler) \"\$@\""
|
||||
) > "./sccache/$compiler"
|
||||
chmod +x "./sccache/$compiler"
|
||||
done
|
||||
|
||||
# CMake must find these wrapper scripts
|
||||
export PATH="$PWD/sccache:$PATH"
|
||||
fi
|
||||
|
||||
# Setup ccache if configured to use it (and not sccache)
|
||||
if [ -z "${SCCACHE}" ] && which ccache > /dev/null; then
|
||||
mkdir -p ./ccache
|
||||
ln -sf "$(which ccache)" ./ccache/cc
|
||||
ln -sf "$(which ccache)" ./ccache/c++
|
||||
ln -sf "$(which ccache)" ./ccache/gcc
|
||||
ln -sf "$(which ccache)" ./ccache/g++
|
||||
ln -sf "$(which ccache)" ./ccache/x86_64-linux-gnu-gcc
|
||||
export CCACHE_WRAPPER_DIR="$PWD/ccache"
|
||||
export PATH="$CCACHE_WRAPPER_DIR:$PATH"
|
||||
fi
|
||||
|
||||
CMAKE_ARGS=("-DBUILD_BINARY=ON")
|
||||
CMAKE_ARGS+=("-DUSE_OBSERVERS=ON")
|
||||
CMAKE_ARGS+=("-DUSE_ZSTD=ON")
|
||||
|
||||
# Run build script from scripts if applicable
|
||||
if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
|
||||
export ANDROID_NDK=/opt/ndk
|
||||
"${ROOT_DIR}/scripts/build_android.sh" ${CMAKE_ARGS[*]} "$@"
|
||||
exit 0
|
||||
fi
|
||||
if [[ "${BUILD_ENVIRONMENT}" == conda* ]]; then
|
||||
|
||||
# click (required by onnx) wants these set
|
||||
export LANG=C.UTF-8
|
||||
export LC_ALL=C.UTF-8
|
||||
|
||||
# SKIP_CONDA_TESTS refers to only the 'test' section of the meta.yaml
|
||||
export SKIP_CONDA_TESTS=1
|
||||
export CONDA_INSTALL_LOCALLY=1
|
||||
"${ROOT_DIR}/scripts/build_anaconda.sh" "$@"
|
||||
|
||||
# The tests all need hypothesis, tabulate, and pydot, which aren't included
|
||||
# in the conda packages
|
||||
conda install -y hypothesis tabulate pydot
|
||||
|
||||
# This build will be tested against onnx tests, which needs onnx installed.
|
||||
# Onnx should be built against the same protobuf that Caffe2 uses, which is
|
||||
# only installed in the conda environment when Caffe2 is.
|
||||
# This path comes from install_anaconda.sh which installs Anaconda into the
|
||||
# docker image
|
||||
PROTOBUF_INCDIR=/opt/conda/include pip install "${ROOT_DIR}/third_party/onnx"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Run cmake from ./build directory
|
||||
mkdir -p ./build
|
||||
cd ./build
|
||||
|
||||
INSTALL_PREFIX="/usr/local/caffe2"
|
||||
CMAKE_ARGS+=("-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}")
|
||||
|
||||
# Explicitly set Python executable.
|
||||
# On Ubuntu 16.04 the default Python is still 2.7.
|
||||
PYTHON="$(which python)"
|
||||
if [[ "${BUILD_ENVIRONMENT}" == py3* ]]; then
|
||||
PYTHON=/usr/bin/python3
|
||||
CMAKE_ARGS+=("-DPYTHON_EXECUTABLE=${PYTHON}")
|
||||
fi
|
||||
|
||||
case "${BUILD_ENVIRONMENT}" in
|
||||
*-mkl*)
|
||||
CMAKE_ARGS+=("-DBLAS=MKL")
|
||||
;;
|
||||
*-cuda*)
|
||||
CMAKE_ARGS+=("-DUSE_CUDA=ON")
|
||||
CMAKE_ARGS+=("-DCUDA_ARCH_NAME=Maxwell")
|
||||
CMAKE_ARGS+=("-DUSE_NNPACK=OFF")
|
||||
|
||||
# Add ccache symlink for nvcc
|
||||
ln -sf "$(which ccache)" "${CCACHE_WRAPPER_DIR}/nvcc"
|
||||
|
||||
# Explicitly set path to NVCC such that the symlink to ccache is used
|
||||
CMAKE_ARGS+=("-DCUDA_NVCC_EXECUTABLE=${CCACHE_WRAPPER_DIR}/nvcc")
|
||||
|
||||
# Ensure FindCUDA.cmake can infer the right path to the CUDA toolkit.
|
||||
# Setting PATH to resolve to the right nvcc alone isn't enough.
|
||||
# See /usr/share/cmake-3.5/Modules/FindCUDA.cmake, block at line 589.
|
||||
export CUDA_PATH="/usr/local/cuda"
|
||||
|
||||
# Ensure the ccache symlink can still find the real nvcc binary.
|
||||
export PATH="/usr/local/cuda/bin:$PATH"
|
||||
;;
|
||||
esac
|
||||
|
||||
# Try to include Redis support for Linux builds
|
||||
if [ "$(uname)" == "Linux" ]; then
|
||||
CMAKE_ARGS+=("-DUSE_REDIS=ON")
|
||||
fi
|
||||
|
||||
# Currently, on Jenkins mac os, we will use custom protobuf. Mac OS
|
||||
# contbuild at the moment is minimal dependency - it doesn't use glog
|
||||
# or gflags either.
|
||||
if [ "$(uname)" == "Darwin" ]; then
|
||||
CMAKE_ARGS+=("-DBUILD_CUSTOM_PROTOBUF=ON")
|
||||
fi
|
||||
|
||||
# We test the presence of cmake3 (for platforms like Centos and Ubuntu 14.04)
|
||||
# and use that if so.
|
||||
if [[ -x "$(command -v cmake3)" ]]; then
|
||||
CMAKE_BINARY=cmake3
|
||||
else
|
||||
CMAKE_BINARY=cmake
|
||||
fi
|
||||
|
||||
# Configure
|
||||
${CMAKE_BINARY} "${ROOT_DIR}" ${CMAKE_ARGS[*]} "$@"
|
||||
|
||||
# Build
|
||||
if [ "$(uname)" == "Linux" ]; then
|
||||
make "-j$(nproc)" install
|
||||
else
|
||||
echo "Don't know how to build on $(uname)"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Install ONNX into a local directory
|
||||
ONNX_INSTALL_PATH="/usr/local/onnx"
|
||||
pip install "${ROOT_DIR}/third_party/onnx" -t "${ONNX_INSTALL_PATH}"
|
||||
|
||||
# Symlink the caffe2 base python path into the system python path,
|
||||
# so that we can import caffe2 without having to change $PYTHONPATH.
|
||||
# Run in a subshell to contain environment set by /etc/os-release.
|
||||
#
|
||||
# This is only done when running on Jenkins! We don't want to pollute
|
||||
# the user environment with Python symlinks and ld.so.conf.d hacks.
|
||||
#
|
||||
if [ -n "${JENKINS_URL}" ]; then
|
||||
(
|
||||
source /etc/os-release
|
||||
|
||||
function python_version() {
|
||||
"$PYTHON" -c 'import sys; print("python%d.%d" % sys.version_info[0:2])'
|
||||
}
|
||||
|
||||
# Debian/Ubuntu
|
||||
if [[ "$ID_LIKE" == *debian* ]]; then
|
||||
python_path="/usr/local/lib/$(python_version)/dist-packages"
|
||||
sudo ln -sf "${INSTALL_PREFIX}/caffe2" "${python_path}"
|
||||
sudo ln -sf "${ONNX_INSTALL_PATH}/onnx" "${python_path}"
|
||||
fi
|
||||
|
||||
# RHEL/CentOS
|
||||
if [[ "$ID_LIKE" == *rhel* ]]; then
|
||||
python_path="/usr/lib64/$(python_version)/site-packages/"
|
||||
sudo ln -sf "${INSTALL_PREFIX}/caffe2" "${python_path}"
|
||||
sudo ln -sf "${ONNX_INSTALL_PATH}/onnx" "${python_path}"
|
||||
fi
|
||||
|
||||
# /etc/ld.so.conf.d is used on both Debian and RHEL
|
||||
echo "${INSTALL_PREFIX}/lib" | sudo tee /etc/ld.so.conf.d/caffe2.conf
|
||||
sudo ldconfig
|
||||
)
|
||||
fi
|
132
.jenkins/caffe2/test.sh
Executable file
132
.jenkins/caffe2/test.sh
Executable file
@ -0,0 +1,132 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -ex
|
||||
|
||||
LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
ROOT_DIR=$(cd "$LOCAL_DIR"/../.. && pwd)
|
||||
|
||||
# Figure out which Python to use
|
||||
PYTHON="python"
|
||||
if [ -n "$BUILD_ENVIRONMENT" ]; then
|
||||
if [[ "$BUILD_ENVIRONMENT" == py2* ]]; then
|
||||
PYTHON="python2"
|
||||
elif [[ "$BUILD_ENVIRONMENT" == py3* ]]; then
|
||||
PYTHON="python3"
|
||||
fi
|
||||
fi
|
||||
|
||||
# The prefix must mirror the setting from build.sh
|
||||
INSTALL_PREFIX="/usr/local/caffe2"
|
||||
|
||||
# Anaconda builds have a special install prefix and python
|
||||
if [[ "$BUILD_ENVIRONMENT" == conda* ]]; then
|
||||
# This path comes from install_anaconda.sh which installs Anaconda into the
|
||||
# docker image
|
||||
PYTHON="/opt/conda/bin/python"
|
||||
INSTALL_PREFIX="/opt/conda/"
|
||||
|
||||
# Testing requires separate packages
|
||||
if [[ $BUILD_ENVIRONMENT == *gcc4* ]]; then
|
||||
# These are from conda-forge
|
||||
conda install -yc conda-forge hypothesis tabulate pydot networkx==2.0 click pytest scipy
|
||||
# These packages are from the default channels
|
||||
conda install -y opencv=3.1.0=np112py27_1 pil=1.1.7=py27_2
|
||||
else
|
||||
conda install -y hypothesis tabulate pydot
|
||||
fi
|
||||
|
||||
# This build will be tested against onnx tests, which needs onnx installed.
|
||||
# Onnx should be built against the same protobuf that Caffe2 uses, which is
|
||||
# only installed in the conda environment when Caffe2 is.
|
||||
# This path comes from install_anaconda.sh which installs Anaconda into the
|
||||
# docker image
|
||||
PROTOBUF_INCDIR=/opt/conda/include pip install "${ROOT_DIR}/third_party/onnx"
|
||||
fi
|
||||
|
||||
# Add the site-packages in the caffe2 install prefix to the PYTHONPATH
|
||||
SITE_DIR=$($PYTHON -c "from distutils import sysconfig; print(sysconfig.get_python_lib(prefix=''))")
|
||||
INSTALL_SITE_DIR="${INSTALL_PREFIX}/${SITE_DIR}"
|
||||
|
||||
# Skip tests in environments where they are not built/applicable
|
||||
if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
|
||||
echo 'Skipping tests'
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Set PYTHONPATH and LD_LIBRARY_PATH so that python can find the installed
|
||||
# Caffe2. This shouldn't be done on Anaconda, as Anaconda should handle this.
|
||||
if [[ "$BUILD_ENVIRONMENT" != conda* ]]; then
|
||||
export PYTHONPATH="${PYTHONPATH}:$INSTALL_SITE_DIR"
|
||||
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${INSTALL_PREFIX}/lib"
|
||||
fi
|
||||
|
||||
exit_code=0
|
||||
|
||||
cd "$ROOT_DIR"
|
||||
|
||||
if [ -d ./test ]; then
|
||||
echo "Directory ./test already exists; please remove it..."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
mkdir -p ./test/{cpp,python}
|
||||
TEST_DIR="$PWD/test"
|
||||
|
||||
cd ${INSTALL_PREFIX}
|
||||
|
||||
# Commands below may exit with non-zero status
|
||||
set +e
|
||||
|
||||
# C++ tests
|
||||
echo "Running C++ tests.."
|
||||
for test in ./test/*; do
|
||||
# Skip tests we know are hanging or bad
|
||||
case "$(basename "$test")" in
|
||||
mkl_utils_test)
|
||||
continue
|
||||
;;
|
||||
# TODO investigate conv_op_test failures when using MKL
|
||||
conv_op_test)
|
||||
continue
|
||||
;;
|
||||
esac
|
||||
|
||||
"$test" --gtest_output=xml:"$TEST_DIR"/cpp/$(basename "$test").xml
|
||||
tmp_exit_code="$?"
|
||||
if [ "$exit_code" -eq 0 ]; then
|
||||
exit_code="$tmp_exit_code"
|
||||
fi
|
||||
done
|
||||
|
||||
# Get the relative path to where the caffe2 python module was installed
|
||||
CAFFE2_PYPATH="$INSTALL_SITE_DIR/caffe2"
|
||||
|
||||
# Collect additional tests to run (outside caffe2/python)
|
||||
EXTRA_TESTS=()
|
||||
|
||||
# CUDA builds always include NCCL support
|
||||
if [[ "$BUILD_ENVIRONMENT" == *-cuda* ]]; then
|
||||
EXTRA_TESTS+=("$CAFFE2_PYPATH/contrib/nccl")
|
||||
fi
|
||||
|
||||
# Python tests
|
||||
echo "Running Python tests.."
|
||||
"$PYTHON" \
|
||||
-m pytest \
|
||||
-x \
|
||||
-v \
|
||||
--junit-xml="$TEST_DIR/python/result.xml" \
|
||||
--ignore "$CAFFE2_PYPATH/python/test/executor_test.py" \
|
||||
--ignore "$CAFFE2_PYPATH/python/operator_test/matmul_op_test.py" \
|
||||
--ignore "$CAFFE2_PYPATH/python/operator_test/pack_ops_test.py" \
|
||||
--ignore "$CAFFE2_PYPATH/python/mkl/mkl_sbn_speed_test.py" \
|
||||
"$CAFFE2_PYPATH/python" \
|
||||
"${EXTRA_TESTS[@]}"
|
||||
|
||||
tmp_exit_code="$?"
|
||||
if [ "$exit_code" -eq 0 ]; then
|
||||
exit_code="$tmp_exit_code"
|
||||
fi
|
||||
|
||||
# Exit with the first non-zero status we got
|
||||
exit "$exit_code"
|
286
CMakeLists.txt
Normal file
286
CMakeLists.txt
Normal file
@ -0,0 +1,286 @@
|
||||
cmake_minimum_required(VERSION 3.2 FATAL_ERROR)
|
||||
#cmake_policy(SET CMP0022 NEW)
|
||||
#cmake_policy(SET CMP0023 NEW)
|
||||
|
||||
# ---[ Project and semantic versioning.
|
||||
project(Caffe2 CXX C)
|
||||
|
||||
set(CAFFE2_VERSION_MAJOR 0)
|
||||
set(CAFFE2_VERSION_MINOR 8)
|
||||
set(CAFFE2_VERSION_PATCH 2)
|
||||
set(CAFFE2_VERSION
|
||||
"${CAFFE2_VERSION_MAJOR}.${CAFFE2_VERSION_MINOR}.${CAFFE2_VERSION_PATCH}")
|
||||
|
||||
# One variable that determines whether the current cmake process is being run
|
||||
# with the main Caffe2 library. This is useful for building modules - if
|
||||
# modules are built with the main Caffe2 library then one does not need to do
|
||||
# find caffe2 in the cmake script. One can usually guard it in some way like
|
||||
# if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO)
|
||||
# find_package(Caffe2 REQUIRED)
|
||||
# endif()
|
||||
set(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO ON)
|
||||
|
||||
# ---[ Options.
|
||||
# Note to developers: if you add an option below, make sure you also add it to
|
||||
# cmake/Summary.cmake so that the summary prints out the option values.
|
||||
include(CMakeDependentOption)
|
||||
option(BUILD_BINARY "Build C++ binaries" ON)
|
||||
option(BUILD_DOCS "Build documentation" OFF)
|
||||
option(BUILD_CUSTOM_PROTOBUF "Build and use Caffe2's own protobuf under third_party" OFF)
|
||||
option(BUILD_PYTHON "Build Python binaries" ON)
|
||||
option(BUILD_SHARED_LIBS "Build libcaffe2.so" ON)
|
||||
cmake_dependent_option(
|
||||
CAFFE2_LINK_LOCAL_PROTOBUF "If set, build protobuf inside libcaffe2.so." ON
|
||||
"BUILD_SHARED_LIBS AND BUILD_CUSTOM_PROTOBUF" OFF)
|
||||
cmake_dependent_option(
|
||||
CAFFE2_USE_MSVC_STATIC_RUNTIME "Using MSVC static runtime libraries" ON
|
||||
"NOT BUILD_SHARED_LIBS" OFF)
|
||||
option(BUILD_TEST "Build C++ test binaries (need gtest and gbenchmark)" ON)
|
||||
option(USE_ACL "Use ARM Compute Library" OFF)
|
||||
option(USE_ASAN "Use Address Sanitizer" OFF)
|
||||
option(USE_ATEN "Use ATen" OFF)
|
||||
option(USE_CUDA "Use Cuda" ON)
|
||||
option(USE_FFMPEG "Use ffmpeg" OFF)
|
||||
option(USE_GFLAGS "Use GFLAGS" ON)
|
||||
option(USE_GLOG "Use GLOG" ON)
|
||||
option(USE_GLOO "Use Gloo" ON)
|
||||
option(USE_LEVELDB "Use LEVELDB" ON)
|
||||
option(USE_LITE_PROTO "Use lite protobuf instead of full." OFF)
|
||||
option(USE_LMDB "Use LMDB" ON)
|
||||
option(USE_METAL "Use Metal for iOS build" ON)
|
||||
option(USE_MOBILE_OPENGL "Use OpenGL for mobile code" ON)
|
||||
option(USE_MPI "Use MPI" ON)
|
||||
option(USE_NATIVE_ARCH "Use -march=native" OFF)
|
||||
option(USE_NCCL "Use NCCL" ON)
|
||||
option(USE_NERVANA_GPU "Use Nervana GPU backend" OFF)
|
||||
option(USE_NNAPI "Use NNAPI" OFF)
|
||||
option(USE_NNPACK "Use NNPACK" ON)
|
||||
option(USE_NUMA "Use NUMA (only available on Linux)" ON)
|
||||
option(USE_OBSERVERS "Use observers module." OFF)
|
||||
option(USE_OPENCV "Use openCV" ON)
|
||||
option(USE_OPENMP "Use OpenMP for parallel code" OFF)
|
||||
option(USE_PROF "Use profiling" OFF)
|
||||
option(USE_REDIS "Use Redis" OFF)
|
||||
option(USE_ROCKSDB "Use RocksDB" OFF)
|
||||
option(USE_SNPE "Use Qualcomm's SNPE library" OFF)
|
||||
option(USE_ZMQ "Use ZMQ" OFF)
|
||||
option(USE_ZSTD "Use ZSTD" OFF)
|
||||
|
||||
# ---[ CMake scripts + modules
|
||||
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
|
||||
|
||||
if (MSVC AND ${BUILD_SHARED_LIBS})
|
||||
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
|
||||
endif()
|
||||
|
||||
# ---[ CMake build directories
|
||||
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
|
||||
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
|
||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
||||
|
||||
enable_testing()
|
||||
|
||||
# ---[ Misc checks to cope with various compiler modes
|
||||
include(cmake/MiscCheck.cmake)
|
||||
include(cmake/BuildVariables.cmake)
|
||||
|
||||
# External projects
|
||||
include(ExternalProject)
|
||||
|
||||
# TODO: merge the following 3 files into cmake/public/utils.cmake.
|
||||
include(cmake/Utils.cmake)
|
||||
include(cmake/public/utils.cmake)
|
||||
|
||||
set(CAFFE2_WHITELIST "" CACHE STRING "A whitelist file of files that one should build.")
|
||||
|
||||
# Set default build type
|
||||
if(NOT CMAKE_BUILD_TYPE)
|
||||
message(STATUS "Build type not set - defaulting to Release")
|
||||
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build from: Debug Release RelWithDebInfo MinSizeRel Coverage." FORCE)
|
||||
endif()
|
||||
|
||||
# ---[ Dependencies
|
||||
include(cmake/Dependencies.cmake)
|
||||
|
||||
# ---[ Whitelist file if whitelist is specified
|
||||
include(cmake/Whitelist.cmake)
|
||||
|
||||
# ---[ Set link flag, handle additional deps for gcc 4.8 and above
|
||||
if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.8.0 AND NOT ANDROID)
|
||||
message(STATUS "GCC ${CMAKE_CXX_COMPILER_VERSION}: Adding gcc and gcc_s libs to link line")
|
||||
list(APPEND Caffe2_DEPENDENCY_LIBS gcc_s gcc)
|
||||
endif()
|
||||
|
||||
# ---[ Build flags
|
||||
set(CMAKE_C_STANDARD 99)
|
||||
set(CMAKE_CXX_STANDARD 11)
|
||||
if(NOT MSVC)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -fPIC")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
|
||||
# Eigen fails to build with some versions, so convert this to a warning
|
||||
# Details at http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1459
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-invalid-partial-specialization")
|
||||
else()
|
||||
foreach(flag_var
|
||||
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
|
||||
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
|
||||
if (${CAFFE2_USE_MSVC_STATIC_RUNTIME})
|
||||
if(${flag_var} MATCHES "/MD")
|
||||
string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
|
||||
endif(${flag_var} MATCHES "/MD")
|
||||
else()
|
||||
if(${flag_var} MATCHES "/MT")
|
||||
string(REGEX REPLACE "/MT" "/MD" ${flag_var} "${${flag_var}}")
|
||||
endif()
|
||||
endif()
|
||||
set(${flag_var} "${${flag_var}} /MP /bigobj")
|
||||
endforeach(flag_var)
|
||||
endif()
|
||||
|
||||
if(ANDROID)
|
||||
if(CMAKE_COMPILER_IS_GNUCXX)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -s")
|
||||
else()
|
||||
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -s")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(NOT APPLE AND UNIX)
|
||||
list(APPEND Caffe2_DEPENDENCY_LIBS dl)
|
||||
endif()
|
||||
|
||||
# Prefix path to Caffe2 headers.
|
||||
# If a directory containing installed Caffe2 headers was inadvertently
|
||||
# added to the list of include directories, prefixing
|
||||
# PROJECT_SOURCE_DIR means this source tree always takes precedence.
|
||||
include_directories(BEFORE ${PROJECT_SOURCE_DIR})
|
||||
|
||||
# Prefix path to generated Caffe2 headers.
|
||||
# These need to take precedence over their empty counterparts located
|
||||
# in PROJECT_SOURCE_DIR.
|
||||
include_directories(BEFORE ${PROJECT_BINARY_DIR})
|
||||
|
||||
# ---[ Old caffe protobuf.
|
||||
add_subdirectory(caffe/proto)
|
||||
|
||||
# ---[ Main build
|
||||
add_subdirectory(caffe2)
|
||||
|
||||
# Documentation Option
|
||||
if(BUILD_DOCS)
|
||||
# check if Doxygen is installed
|
||||
find_package(Doxygen)
|
||||
if (DOXYGEN_FOUND)
|
||||
message("Generating documentation")
|
||||
|
||||
set(DOXYGEN_C_IN ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/.Doxyfile-c)
|
||||
set(DOXYGEN_C_OUT ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/Doxyfile-c)
|
||||
set(DOXYGEN_P_IN ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/.Doxyfile-python)
|
||||
set(DOXYGEN_P_OUT ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/Doxyfile-python)
|
||||
|
||||
if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/docs)
|
||||
file(REMOVE_RECURSE ${CMAKE_CURRENT_BINARY_DIR}/docs)
|
||||
endif (EXISTS ${CMAKE_CURRENT_BINARY_DIR}/docs)
|
||||
|
||||
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/docs)
|
||||
configure_file(${DOXYGEN_C_IN} ${DOXYGEN_C_OUT} @ONLY)
|
||||
configure_file(${DOXYGEN_P_IN} ${DOXYGEN_P_OUT} @ONLY)
|
||||
|
||||
add_custom_target(doc_doxygen_c ALL
|
||||
COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_C_OUT}
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
COMMENT "Generating C++ API documentation with Doxygen"
|
||||
VERBATIM)
|
||||
|
||||
add_custom_target(doc_doxygen_python ALL
|
||||
COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_P_OUT}
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
COMMENT "Generating Python API documentation with Doxygen"
|
||||
VERBATIM)
|
||||
else (DOXYGEN_FOUND)
|
||||
message(FATAL_ERROR "Doxygen needs to be installed to generate the documentation")
|
||||
endif (DOXYGEN_FOUND)
|
||||
endif (BUILD_DOCS)
|
||||
|
||||
# ---[ CMake related files
|
||||
# Uninistall option.
|
||||
if(NOT TARGET caffe2_uninstall)
|
||||
configure_file(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake
|
||||
IMMEDIATE @ONLY)
|
||||
|
||||
add_custom_target(caffe2_uninstall
|
||||
COMMAND ${CMAKE_COMMAND} -P
|
||||
${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake)
|
||||
endif()
|
||||
|
||||
# ---[ Make configuration files for cmake to allow dependent libraries
|
||||
# easier access to Caffe2.
|
||||
|
||||
if ((NOT USE_GLOG) OR (NOT USE_GFLAGS) OR BUILD_CUSTOM_PROTOBUF)
|
||||
message(WARNING
|
||||
"Generated cmake files are only fully tested if one builds "
|
||||
"with system glog, gflags, and protobuf. Other settings may "
|
||||
"generate files that are not well tested.")
|
||||
endif()
|
||||
|
||||
if (USE_CUDA)
|
||||
# TODO: check if we should include other cuda dependency libraries
|
||||
# to the interface as well.
|
||||
|
||||
endif()
|
||||
|
||||
# Note(jiayq): when building static libraries, all PRIVATE dependencies
|
||||
# will also become interface libraries, and as a result if there are any
|
||||
# dependency libraries that are not exported, the following install export
|
||||
# script will fail. As a result, we will only provide the targets cmake
|
||||
# files for shared lib installation. For more info, read:
|
||||
# https://cmake.org/pipermail/cmake/2016-May/063400.html
|
||||
if (BUILD_SHARED_LIBS)
|
||||
configure_file(
|
||||
${PROJECT_SOURCE_DIR}/cmake/Caffe2ConfigVersion.cmake.in
|
||||
${PROJECT_BINARY_DIR}/Caffe2ConfigVersion.cmake
|
||||
@ONLY)
|
||||
configure_file(
|
||||
${PROJECT_SOURCE_DIR}/cmake/Caffe2Config.cmake.in
|
||||
${PROJECT_BINARY_DIR}/Caffe2Config.cmake
|
||||
@ONLY)
|
||||
install(FILES
|
||||
${PROJECT_BINARY_DIR}/Caffe2ConfigVersion.cmake
|
||||
${PROJECT_BINARY_DIR}/Caffe2Config.cmake
|
||||
DESTINATION share/cmake/Caffe2
|
||||
COMPONENT dev)
|
||||
install(FILES
|
||||
${PROJECT_SOURCE_DIR}/cmake/public/cuda.cmake
|
||||
${PROJECT_SOURCE_DIR}/cmake/public/glog.cmake
|
||||
${PROJECT_SOURCE_DIR}/cmake/public/gflags.cmake
|
||||
${PROJECT_SOURCE_DIR}/cmake/public/protobuf.cmake
|
||||
${PROJECT_SOURCE_DIR}/cmake/public/threads.cmake
|
||||
${PROJECT_SOURCE_DIR}/cmake/public/utils.cmake
|
||||
DESTINATION share/cmake/Caffe2/public
|
||||
COMPONENT dev)
|
||||
install(EXPORT Caffe2Targets DESTINATION share/cmake/Caffe2
|
||||
FILE Caffe2Targets.cmake
|
||||
COMPONENT dev)
|
||||
else()
|
||||
message(WARNING
|
||||
"Generated cmake files are only available when building "
|
||||
"shared libs.")
|
||||
endif()
|
||||
|
||||
# ---[ Modules
|
||||
add_subdirectory(modules)
|
||||
|
||||
# ---[ Binaries
|
||||
# Binaries will be built after the Caffe2 main libraries and the modules
|
||||
# are built. For the binaries, they will be linked to the Caffe2 main
|
||||
# libraries, as well as all the modules that are built with Caffe2 (the ones
|
||||
# built in the previous Modules section above).
|
||||
|
||||
if (BUILD_BINARY)
|
||||
add_subdirectory(binaries)
|
||||
endif()
|
||||
|
||||
include(cmake/Summary.cmake)
|
||||
caffe2_print_configuration_summary()
|
10
LICENSE
10
LICENSE
@ -1,13 +1,3 @@
|
||||
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
|
||||
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
|
||||
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
|
||||
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
|
||||
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
|
||||
Copyright (c) 2011-2013 NYU (Clement Farabet)
|
||||
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
|
||||
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
|
||||
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
21
Makefile
Normal file
21
Makefile
Normal file
@ -0,0 +1,21 @@
|
||||
# This makefile does nothing but delegating the actual building to cmake.
|
||||
|
||||
all:
|
||||
@mkdir -p build && cd build && cmake .. $(shell python ./scripts/get_python_cmake_flags.py) && $(MAKE)
|
||||
|
||||
local:
|
||||
@./scripts/build_local.sh
|
||||
|
||||
android:
|
||||
@./scripts/build_android.sh
|
||||
|
||||
ios:
|
||||
@./scripts/build_ios.sh
|
||||
|
||||
clean: # This will remove ALL build folders.
|
||||
@rm -r build*/
|
||||
|
||||
linecount:
|
||||
@cloc --read-lang-def=caffe.cloc caffe2 || \
|
||||
echo "Cloc is not available on the machine. You can install cloc with " && \
|
||||
echo " sudo apt-get install cloc"
|
351
NOTICE
Normal file
351
NOTICE
Normal file
@ -0,0 +1,351 @@
|
||||
From PyTorch:
|
||||
|
||||
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
|
||||
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
|
||||
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
|
||||
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
|
||||
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
|
||||
Copyright (c) 2011-2013 NYU (Clement Farabet)
|
||||
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
|
||||
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
|
||||
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
|
||||
|
||||
From Caffe2:
|
||||
|
||||
Copyright (c) 2016-present, Facebook Inc. All rights reserved.
|
||||
|
||||
All contributions by Facebook:
|
||||
Copyright (c) 2016 Facebook Inc.
|
||||
|
||||
All contributions by Google:
|
||||
Copyright (c) 2015 Google Inc.
|
||||
All rights reserved.
|
||||
|
||||
All contributions by Yangqing Jia:
|
||||
Copyright (c) 2015 Yangqing Jia
|
||||
All rights reserved.
|
||||
|
||||
All contributions from Caffe:
|
||||
Copyright(c) 2013, 2014, 2015, the respective contributors
|
||||
All rights reserved.
|
||||
|
||||
All other contributions:
|
||||
Copyright(c) 2015, 2016 the respective contributors
|
||||
All rights reserved.
|
||||
|
||||
Caffe2 uses a copyright model similar to Caffe: each contributor holds
|
||||
copyright over their contributions to Caffe2. The project versioning records
|
||||
all such contribution and copyright details. If a contributor wants to further
|
||||
mark their specific copyright on a particular contribution, they should
|
||||
indicate their copyright solely in the commit message of the change when it is
|
||||
committed.
|
||||
|
||||
=======================================================================
|
||||
Software under third_party
|
||||
=======================================================================
|
||||
Software libraries under third_party are provided as github submodule
|
||||
links, and their content is not part of the Caffe2 codebase. Their
|
||||
licences can be found under the respective software repositories.
|
||||
|
||||
=======================================================================
|
||||
Earlier BSD License
|
||||
=======================================================================
|
||||
Early development of Caffe2 in 2015 and early 2016 is licensed under the
|
||||
BSD license. The license is attached below:
|
||||
|
||||
All contributions by Facebook:
|
||||
Copyright (c) 2016 Facebook Inc.
|
||||
|
||||
All contributions by Google:
|
||||
Copyright (c) 2015 Google Inc.
|
||||
All rights reserved.
|
||||
|
||||
All contributions by Yangqing Jia:
|
||||
Copyright (c) 2015 Yangqing Jia
|
||||
All rights reserved.
|
||||
|
||||
All other contributions:
|
||||
Copyright(c) 2015, 2016 the respective contributors
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
=======================================================================
|
||||
Caffe's BSD License
|
||||
=======================================================================
|
||||
Some parts of the caffe2 code is derived from the original Caffe code, which is
|
||||
created by Yangqing Jia and is now a BSD-licensed open-source project. The Caffe
|
||||
license is as follows:
|
||||
|
||||
COPYRIGHT
|
||||
|
||||
All contributions by the University of California:
|
||||
Copyright (c) 2014, The Regents of the University of California (Regents)
|
||||
All rights reserved.
|
||||
|
||||
All other contributions:
|
||||
Copyright (c) 2014, the respective contributors
|
||||
All rights reserved.
|
||||
|
||||
Caffe uses a shared copyright model: each contributor holds copyright over
|
||||
their contributions to Caffe. The project versioning records all such
|
||||
contribution and copyright details. If a contributor wants to further mark
|
||||
their specific copyright on a particular contribution, they should indicate
|
||||
their copyright solely in the commit message of the change when it is
|
||||
committed.
|
||||
|
||||
LICENSE
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
CONTRIBUTION AGREEMENT
|
||||
|
||||
By contributing to the BVLC/caffe repository through pull-request, comment,
|
||||
or otherwise, the contributor releases their content to the
|
||||
license and copyright terms herein.
|
||||
|
||||
=======================================================================
|
||||
Caffe2's Apache License
|
||||
=======================================================================
|
||||
|
||||
This repo contains Caffe2 code, which was previously licensed under
|
||||
Apache License Version 2.0:
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
52
binaries/CMakeLists.txt
Normal file
52
binaries/CMakeLists.txt
Normal file
@ -0,0 +1,52 @@
|
||||
caffe2_binary_target("convert_caffe_image_db.cc")
|
||||
caffe2_binary_target("convert_db.cc")
|
||||
caffe2_binary_target("make_cifar_db.cc")
|
||||
caffe2_binary_target("make_mnist_db.cc")
|
||||
caffe2_binary_target("predictor_verifier.cc")
|
||||
caffe2_binary_target("print_registered_core_operators.cc")
|
||||
caffe2_binary_target("run_plan.cc")
|
||||
caffe2_binary_target("speed_benchmark.cc")
|
||||
caffe2_binary_target("split_db.cc")
|
||||
|
||||
caffe2_binary_target("db_throughput.cc")
|
||||
|
||||
if (USE_CUDA)
|
||||
caffe2_binary_target("inspect_gpus.cc")
|
||||
target_link_libraries(inspect_gpus ${CUDA_LIBRARIES})
|
||||
caffe2_binary_target("print_core_object_sizes.cc")
|
||||
|
||||
if (BUILD_TEST)
|
||||
# Core overhead benchmark
|
||||
caffe2_binary_target("core_overhead_benchmark.cc")
|
||||
target_link_libraries(core_overhead_benchmark benchmark ${CUDA_curand_LIBRARY})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (USE_ZMQ)
|
||||
caffe2_binary_target("zmq_feeder.cc")
|
||||
target_link_libraries(zmq_feeder ${ZMQ_LIBRARIES})
|
||||
endif()
|
||||
|
||||
if(USE_MPI)
|
||||
caffe2_binary_target("run_plan_mpi.cc")
|
||||
target_link_libraries(run_plan_mpi ${MPI_CXX_LIBRARIES})
|
||||
endif()
|
||||
|
||||
if (USE_OPENCV AND USE_LEVELDB)
|
||||
caffe2_binary_target("convert_encoded_to_raw_leveldb.cc")
|
||||
target_link_libraries(
|
||||
convert_encoded_to_raw_leveldb
|
||||
${OpenCV_LIBS} ${LevelDB_LIBRARIES} ${Snappy_LIBRARIES})
|
||||
endif()
|
||||
|
||||
if (USE_OPENCV)
|
||||
caffe2_binary_target("make_image_db.cc")
|
||||
target_link_libraries(make_image_db ${OpenCV_LIBS})
|
||||
endif()
|
||||
|
||||
if (USE_OBSERVERS)
|
||||
caffe2_binary_target("caffe2_benchmark.cc")
|
||||
endif()
|
||||
|
||||
# ---[ tutorials
|
||||
caffe2_binary_target("tutorial_blob.cc")
|
241
binaries/caffe2_benchmark.cc
Normal file
241
binaries/caffe2_benchmark.cc
Normal file
@ -0,0 +1,241 @@
|
||||
#include <fstream>
|
||||
#include <iterator>
|
||||
#include <string>
|
||||
|
||||
#include "caffe2/core/blob_serialization.h"
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/utils/proto_utils.h"
|
||||
#include "caffe2/utils/string_utils.h"
|
||||
|
||||
#include "observers/observer_config.h"
|
||||
|
||||
CAFFE2_DEFINE_string(
|
||||
backend,
|
||||
"builtin",
|
||||
"The backend to use when running the model. The allowed "
|
||||
"backend choices are: builtin, default, nnpack, eigen, mkl");
|
||||
CAFFE2_DEFINE_string(
|
||||
init_net,
|
||||
"",
|
||||
"The given net to initialize any parameters.");
|
||||
CAFFE2_DEFINE_string(
|
||||
input,
|
||||
"",
|
||||
"Input that is needed for running the network. If "
|
||||
"multiple input needed, use comma separated string.");
|
||||
CAFFE2_DEFINE_string(
|
||||
input_dims,
|
||||
"",
|
||||
"Alternate to input_files, if all inputs are simple "
|
||||
"float TensorCPUs, specify the dimension using comma "
|
||||
"separated numbers. If multiple input needed, use "
|
||||
"semicolon to separate the dimension of different "
|
||||
"tensors.");
|
||||
CAFFE2_DEFINE_string(
|
||||
input_file,
|
||||
"",
|
||||
"Input file that contain the serialized protobuf for "
|
||||
"the input blobs. If multiple input needed, use comma "
|
||||
"separated string. Must have the same number of items "
|
||||
"as input does.");
|
||||
CAFFE2_DEFINE_string(
|
||||
input_type,
|
||||
"float",
|
||||
"Input type when specifying the input dimension."
|
||||
"The supported types are float, uint8_t.");
|
||||
CAFFE2_DEFINE_int(iter, 10, "The number of iterations to run.");
|
||||
CAFFE2_DEFINE_string(net, "", "The given net to benchmark.");
|
||||
CAFFE2_DEFINE_string(
|
||||
output,
|
||||
"",
|
||||
"Output that should be dumped after the execution "
|
||||
"finishes. If multiple outputs are needed, use comma "
|
||||
"separated string. If you want to dump everything, pass "
|
||||
"'*' as the output value.");
|
||||
CAFFE2_DEFINE_string(
|
||||
output_folder,
|
||||
"",
|
||||
"The folder that the output should be written to. This "
|
||||
"folder must already exist in the file system.");
|
||||
CAFFE2_DEFINE_bool(
|
||||
run_individual,
|
||||
false,
|
||||
"Whether to benchmark individual operators.");
|
||||
CAFFE2_DEFINE_bool(
|
||||
text_output,
|
||||
false,
|
||||
"Whether to write out output in text format for regression purpose.");
|
||||
CAFFE2_DEFINE_int(warmup, 0, "The number of iterations to warm up.");
|
||||
|
||||
using std::string;
|
||||
using std::unique_ptr;
|
||||
using std::vector;
|
||||
|
||||
static void writeTextOutput(
|
||||
caffe2::TensorCPU* tensor,
|
||||
const string& output_prefix,
|
||||
const string& name) {
|
||||
string output_name = output_prefix + "/" + name + ".txt";
|
||||
caffe2::TensorSerializer<caffe2::CPUContext> ser;
|
||||
caffe2::BlobProto blob_proto;
|
||||
ser.Serialize(
|
||||
*tensor, output_name, blob_proto.mutable_tensor(), 0, tensor->size());
|
||||
blob_proto.set_name(output_name);
|
||||
blob_proto.set_type("Tensor");
|
||||
CAFFE_ENFORCE(blob_proto.has_tensor());
|
||||
caffe2::TensorProto tensor_proto = blob_proto.tensor();
|
||||
vector<float> data;
|
||||
switch (tensor_proto.data_type()) {
|
||||
case caffe2::TensorProto::FLOAT: {
|
||||
std::copy(
|
||||
tensor_proto.float_data().begin(),
|
||||
tensor_proto.float_data().end(),
|
||||
std::back_inserter(data));
|
||||
break;
|
||||
}
|
||||
case caffe2::TensorProto::INT32: {
|
||||
std::copy(
|
||||
tensor_proto.int32_data().begin(),
|
||||
tensor_proto.int32_data().end(),
|
||||
std::back_inserter(data));
|
||||
break;
|
||||
}
|
||||
default:
|
||||
CAFFE_THROW("Unimplemented Blob type.");
|
||||
}
|
||||
std::ofstream output_file(output_name);
|
||||
std::ostream_iterator<float> output_iterator(output_file, "\n");
|
||||
std::copy(data.begin(), data.end(), output_iterator);
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
caffe2::GlobalInit(&argc, &argv);
|
||||
caffe2::ShowLogInfoToStderr();
|
||||
unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
|
||||
|
||||
// Run initialization network.
|
||||
caffe2::NetDef init_net_def;
|
||||
CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_init_net, &init_net_def));
|
||||
CAFFE_ENFORCE(workspace->RunNetOnce(init_net_def));
|
||||
|
||||
// Load input.
|
||||
if (caffe2::FLAGS_input.size()) {
|
||||
vector<string> input_names = caffe2::split(',', caffe2::FLAGS_input);
|
||||
if (caffe2::FLAGS_input_file.size()) {
|
||||
vector<string> input_files = caffe2::split(',', caffe2::FLAGS_input_file);
|
||||
CAFFE_ENFORCE_EQ(
|
||||
input_names.size(),
|
||||
input_files.size(),
|
||||
"Input name and file should have the same number.");
|
||||
for (int i = 0; i < input_names.size(); ++i) {
|
||||
caffe2::BlobProto blob_proto;
|
||||
CAFFE_ENFORCE(caffe2::ReadProtoFromFile(input_files[i], &blob_proto));
|
||||
workspace->CreateBlob(input_names[i])->Deserialize(blob_proto);
|
||||
}
|
||||
} else if (caffe2::FLAGS_input_dims.size()) {
|
||||
vector<string> input_dims_list =
|
||||
caffe2::split(';', caffe2::FLAGS_input_dims);
|
||||
CAFFE_ENFORCE_EQ(
|
||||
input_names.size(),
|
||||
input_dims_list.size(),
|
||||
"Input name and dims should have the same number of items.");
|
||||
for (int i = 0; i < input_names.size(); ++i) {
|
||||
vector<string> input_dims_str = caffe2::split(',', input_dims_list[i]);
|
||||
vector<int> input_dims;
|
||||
for (const string& s : input_dims_str) {
|
||||
input_dims.push_back(caffe2::stoi(s));
|
||||
}
|
||||
if (!workspace->HasBlob(input_names[i])) {
|
||||
workspace->CreateBlob(input_names[i]);
|
||||
}
|
||||
caffe2::TensorCPU* tensor =
|
||||
workspace->GetBlob(input_names[i])->GetMutable<caffe2::TensorCPU>();
|
||||
tensor->Resize(input_dims);
|
||||
if (caffe2::FLAGS_input_type == "float") {
|
||||
tensor->mutable_data<float>();
|
||||
} else {
|
||||
CAFFE_ENFORCE(
|
||||
caffe2::FLAGS_input_type == "uint8_t",
|
||||
"Only supported input types are: float, uint8_t");
|
||||
tensor->mutable_data<uint8_t>();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
CAFFE_THROW(
|
||||
"You requested input tensors, but neither input_file nor "
|
||||
"input_dims is set.");
|
||||
}
|
||||
}
|
||||
|
||||
// Run main network.
|
||||
caffe2::NetDef net_def;
|
||||
CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_net, &net_def));
|
||||
if (caffe2::FLAGS_backend != "builtin") {
|
||||
std::string engine = caffe2::FLAGS_backend == "nnpack"
|
||||
? "NNPACK"
|
||||
: caffe2::FLAGS_backend == "eigen" ? "EIGEN"
|
||||
: caffe2::FLAGS_backend == "mkl"
|
||||
? "MKLDNN"
|
||||
: caffe2::FLAGS_backend == "default" ? "" : "NONE";
|
||||
CAFFE_ENFORCE(engine != "NONE", "Backend is not supported");
|
||||
for (int i = 0; i < net_def.op_size(); i++) {
|
||||
caffe2::OperatorDef* op_def = net_def.mutable_op(i);
|
||||
op_def->set_engine(engine);
|
||||
}
|
||||
}
|
||||
|
||||
caffe2::NetBase* net = workspace->CreateNet(net_def);
|
||||
CHECK_NOTNULL(net);
|
||||
|
||||
LOG(INFO) << "Starting benchmark.";
|
||||
caffe2::ObserverConfig::initSampleRate(
|
||||
1, 1, 1, caffe2::FLAGS_run_individual, caffe2::FLAGS_warmup);
|
||||
LOG(INFO) << "Running warmup runs.";
|
||||
for (int i = 0; i < caffe2::FLAGS_warmup; ++i) {
|
||||
CAFFE_ENFORCE(net->Run(), "Warmup run ", i, " has failed.");
|
||||
}
|
||||
|
||||
LOG(INFO) << "Main runs.";
|
||||
CAFFE_ENFORCE(
|
||||
caffe2::FLAGS_iter >= 0,
|
||||
"Number of main runs should be non negative, provided ",
|
||||
caffe2::FLAGS_iter,
|
||||
".");
|
||||
for (int i = 0; i < caffe2::FLAGS_iter; ++i) {
|
||||
caffe2::ObserverConfig::initSampleRate(1, 1, 1, 0, caffe2::FLAGS_warmup);
|
||||
CAFFE_ENFORCE(net->Run(), "Main run ", i, " has failed.");
|
||||
if (caffe2::FLAGS_run_individual) {
|
||||
caffe2::ObserverConfig::initSampleRate(1, 1, 1, 1, caffe2::FLAGS_warmup);
|
||||
CAFFE_ENFORCE(net->Run(), "Main run ", i, " with operator has failed.");
|
||||
}
|
||||
}
|
||||
|
||||
string output_prefix = caffe2::FLAGS_output_folder.size()
|
||||
? caffe2::FLAGS_output_folder + "/"
|
||||
: "";
|
||||
if (caffe2::FLAGS_output.size()) {
|
||||
vector<string> output_names = caffe2::split(',', caffe2::FLAGS_output);
|
||||
if (caffe2::FLAGS_output == "*") {
|
||||
output_names = workspace->Blobs();
|
||||
}
|
||||
for (const string& name : output_names) {
|
||||
CAFFE_ENFORCE(
|
||||
workspace->HasBlob(name),
|
||||
"You requested a non-existing blob: ",
|
||||
name);
|
||||
if (caffe2::FLAGS_text_output) {
|
||||
auto blob = workspace->GetBlob(name)->GetMutable<caffe2::TensorCPU>();
|
||||
writeTextOutput(blob, output_prefix, name);
|
||||
} else {
|
||||
string serialized = workspace->GetBlob(name)->Serialize(name);
|
||||
string output_filename = output_prefix + name;
|
||||
caffe2::WriteStringToFile(serialized, output_filename.c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
90
binaries/convert_caffe_image_db.cc
Normal file
90
binaries/convert_caffe_image_db.cc
Normal file
@ -0,0 +1,90 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe/proto/caffe.pb.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
CAFFE2_DEFINE_string(input_db, "", "The input db.");
|
||||
CAFFE2_DEFINE_string(input_db_type, "", "The input db type.");
|
||||
CAFFE2_DEFINE_string(output_db, "", "The output db.");
|
||||
CAFFE2_DEFINE_string(output_db_type, "", "The output db type.");
|
||||
CAFFE2_DEFINE_int(batch_size, 1000, "The write batch size.");
|
||||
|
||||
using caffe2::db::Cursor;
|
||||
using caffe2::db::DB;
|
||||
using caffe2::db::Transaction;
|
||||
using caffe2::TensorProto;
|
||||
using caffe2::TensorProtos;
|
||||
|
||||
// Converts a Caffe-style image db (caffe::Datum records) into a Caffe2 db of
// TensorProtos records: proto 0 is the image (encoded bytes or HWC raw bytes),
// proto 1 is the INT32 label. Writes are committed every FLAGS_batch_size items.
int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);

  std::unique_ptr<DB> in_db(caffe2::db::CreateDB(
      caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db, caffe2::db::READ));
  std::unique_ptr<DB> out_db(caffe2::db::CreateDB(
      caffe2::FLAGS_output_db_type, caffe2::FLAGS_output_db, caffe2::db::NEW));
  std::unique_ptr<Cursor> cursor(in_db->NewCursor());
  std::unique_ptr<Transaction> transaction(out_db->NewTransaction());
  int count = 0;
  for (; cursor->Valid(); cursor->Next()) {
    caffe::Datum datum;
    CAFFE_ENFORCE(datum.ParseFromString(cursor->value()));
    TensorProtos protos;
    TensorProto* data = protos.add_protos();
    TensorProto* label = protos.add_protos();
    label->set_data_type(TensorProto::INT32);
    label->add_dims(1);
    label->add_int32_data(datum.label());
    if (datum.encoded()) {
      // This is an encoded image. we will copy over the data directly.
      data->set_data_type(TensorProto::STRING);
      data->add_dims(1);
      data->add_string_data(datum.data());
    } else {
      // float data not supported right now.
      CAFFE_ENFORCE_EQ(datum.float_data_size(), 0);
      std::vector<char> buffer_vec(datum.data().size());
      char* buffer = buffer_vec.data();
      // swap order from CHW (Caffe's layout) to HWC (Caffe2's raw layout):
      // pixel (h, w) of channel c moves from src[c*size + n] to dst[n*channels + c].
      int channels = datum.channels();
      int size = datum.height() * datum.width();
      CAFFE_ENFORCE_EQ(datum.data().size(), channels * size);
      for (int c = 0; c < channels; ++c) {
        char* dst = buffer + c;
        const char* src = datum.data().c_str() + c * size;
        for (int n = 0; n < size; ++n) {
          dst[n*channels] = src[n];
        }
      }
      data->set_data_type(TensorProto::BYTE);
      data->add_dims(datum.height());
      data->add_dims(datum.width());
      data->add_dims(datum.channels());
      data->set_byte_data(buffer, datum.data().size());
    }
    transaction->Put(cursor->key(), protos.SerializeAsString());
    if (++count % caffe2::FLAGS_batch_size == 0) {
      // Flush a full batch to the output db.
      // NOTE(review): items written after the last full batch are only flushed
      // if db::Transaction commits on destruction — verify for each db backend.
      transaction->Commit();
      LOG(INFO) << "Converted " << count << " items so far.";
    }
  }
  LOG(INFO) << "A total of " << count << " items processed.";
  return 0;
}
|
||||
|
51
binaries/convert_db.cc
Normal file
51
binaries/convert_db.cc
Normal file
@ -0,0 +1,51 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
CAFFE2_DEFINE_string(input_db, "", "The input db.");
|
||||
CAFFE2_DEFINE_string(input_db_type, "", "The input db type.");
|
||||
CAFFE2_DEFINE_string(output_db, "", "The output db.");
|
||||
CAFFE2_DEFINE_string(output_db_type, "", "The output db type.");
|
||||
CAFFE2_DEFINE_int(batch_size, 1000, "The write batch size.");
|
||||
|
||||
using caffe2::db::Cursor;
|
||||
using caffe2::db::DB;
|
||||
using caffe2::db::Transaction;
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
caffe2::GlobalInit(&argc, &argv);
|
||||
|
||||
std::unique_ptr<DB> in_db(caffe2::db::CreateDB(
|
||||
caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db, caffe2::db::READ));
|
||||
std::unique_ptr<DB> out_db(caffe2::db::CreateDB(
|
||||
caffe2::FLAGS_output_db_type, caffe2::FLAGS_output_db, caffe2::db::NEW));
|
||||
std::unique_ptr<Cursor> cursor(in_db->NewCursor());
|
||||
std::unique_ptr<Transaction> transaction(out_db->NewTransaction());
|
||||
int count = 0;
|
||||
for (; cursor->Valid(); cursor->Next()) {
|
||||
transaction->Put(cursor->key(), cursor->value());
|
||||
if (++count % caffe2::FLAGS_batch_size == 0) {
|
||||
transaction->Commit();
|
||||
LOG(INFO) << "Converted " << count << " items so far.";
|
||||
}
|
||||
}
|
||||
LOG(INFO) << "A total of " << count << " items processed.";
|
||||
return 0;
|
||||
}
|
156
binaries/convert_encoded_to_raw_leveldb.cc
Normal file
156
binaries/convert_encoded_to_raw_leveldb.cc
Normal file
@ -0,0 +1,156 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// This script converts an image dataset to leveldb.
|
||||
//
|
||||
// caffe2::FLAGS_input_folder is the root folder that holds all the images, and
|
||||
// caffe2::FLAGS_list_file should be a list of files as well as their labels, in the
|
||||
// format as
|
||||
// subfolder1/file1.JPEG 7
|
||||
// ....
|
||||
|
||||
#include <opencv2/opencv.hpp>
|
||||
|
||||
#include <fstream> // NOLINT(readability/streams)
|
||||
#include <memory>
|
||||
#include <random>
|
||||
#include <string>
|
||||
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "leveldb/db.h"
|
||||
#include "leveldb/write_batch.h"
|
||||
|
||||
CAFFE2_DEFINE_string(input_db_name, "", "The input image file name.");
|
||||
CAFFE2_DEFINE_string(output_db_name, "", "The output training leveldb name.");
|
||||
CAFFE2_DEFINE_bool(color, true, "If set, load images in color.");
|
||||
CAFFE2_DEFINE_int(scale, 256,
|
||||
"If caffe2::FLAGS_raw is set, scale all the images' shorter edge to the given "
|
||||
"value.");
|
||||
CAFFE2_DEFINE_bool(warp, false, "If warp is set, warp the images to square.");
|
||||
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
using std::string;
|
||||
using std::unique_ptr;
|
||||
|
||||
// Reads a leveldb of TensorProtos holding JPEG/PNG-encoded images (proto 0:
// string_data, proto 1: label), decodes each image with OpenCV, rescales its
// shorter edge to FLAGS_scale (or warps to a square if FLAGS_warp), and writes
// a new leveldb of raw HWC byte tensors. Batches of 1000 are flushed at a time.
void ConvertToRawDataset(
    const string& input_db_name, const string& output_db_name) {
  // input leveldb
  std::unique_ptr<leveldb::DB> input_db;
  LOG(INFO) << "Opening input leveldb " << input_db_name;
  {
    leveldb::Options options;
    options.create_if_missing = false;
    leveldb::DB* db_temp;
    leveldb::Status status = leveldb::DB::Open(
        options, input_db_name, &db_temp);
    CAFFE_ENFORCE(status.ok(), "Failed to open leveldb ", input_db_name, ".");
    input_db.reset(db_temp);
  }

  // output leveldb; error_if_exists guards against clobbering a previous run.
  std::unique_ptr<leveldb::DB> output_db;
  std::unique_ptr<leveldb::WriteBatch> batch;
  LOG(INFO) << "Opening leveldb " << output_db_name;
  {
    leveldb::Options options;
    options.error_if_exists = true;
    options.create_if_missing = true;
    options.write_buffer_size = 268435456;  // 256 MB write buffer
    leveldb::DB* db_temp;
    leveldb::Status status = leveldb::DB::Open(
        options, output_db_name, &db_temp);
    CAFFE_ENFORCE(
        status.ok(),
        "Failed to open leveldb ",
        output_db_name,
        ". Is it already existing?");
    output_db.reset(db_temp);
  }
  batch.reset(new leveldb::WriteBatch());

  // The output protos are built once and mutated per record; dims 0/1
  // (height/width) are placeholders overwritten for every image.
  TensorProtos input_protos;
  TensorProtos output_protos;
  TensorProto* data = output_protos.add_protos();
  TensorProto* label = output_protos.add_protos();
  data->set_data_type(TensorProto::BYTE);
  data->add_dims(0);
  data->add_dims(0);
  if (caffe2::FLAGS_color) {
    data->add_dims(3);
  }
  string value;

  unique_ptr<leveldb::Iterator> iter;
  iter.reset(input_db->NewIterator(leveldb::ReadOptions()));
  iter->SeekToFirst();
  int count = 0;
  for (; iter->Valid(); iter->Next()) {
    CAFFE_ENFORCE(input_protos.ParseFromString(iter->value().ToString()));
    // Label passes through unchanged.
    label->CopyFrom(input_protos.protos(1));
    const string& encoded_image = input_protos.protos(0).string_data(0);
    int encoded_size = encoded_image.size();
    // Decode straight from the in-db bytes (no copy) via a 1-row cv::Mat view.
    cv::Mat img = cv::imdecode(
        cv::Mat(1, &encoded_size, CV_8UC1,
                const_cast<char*>(encoded_image.data())),
        caffe2::FLAGS_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE);
    cv::Mat resized_img;
    int scaled_width, scaled_height;
    if (caffe2::FLAGS_warp) {
      // Warp to a FLAGS_scale x FLAGS_scale square, ignoring aspect ratio.
      scaled_width = caffe2::FLAGS_scale;
      scaled_height = caffe2::FLAGS_scale;
    } else if (img.rows > img.cols) {
      // Portrait: shorter edge is the width.
      scaled_width = caffe2::FLAGS_scale;
      scaled_height = static_cast<float>(img.rows) * caffe2::FLAGS_scale / img.cols;
    } else {
      // Landscape (or square): shorter edge is the height.
      scaled_height = caffe2::FLAGS_scale;
      scaled_width = static_cast<float>(img.cols) * caffe2::FLAGS_scale / img.rows;
    }
    cv::resize(img, resized_img, cv::Size(scaled_width, scaled_height), 0, 0,
               cv::INTER_LINEAR);
    data->set_dims(0, scaled_height);
    data->set_dims(1, scaled_width);
    // set_byte_data reads the Mat buffer linearly, so it must be contiguous.
    DCHECK(resized_img.isContinuous());
    data->set_byte_data(resized_img.ptr(),
                        scaled_height * scaled_width * (caffe2::FLAGS_color ? 3 : 1));
    output_protos.SerializeToString(&value);
    // Put in db
    batch->Put(iter->key(), value);
    if (++count % 1000 == 0) {
      output_db->Write(leveldb::WriteOptions(), batch.get());
      batch.reset(new leveldb::WriteBatch());
      LOG(INFO) << "Processed " << count << " files.";
    }
  }
  // write the last (partial) batch
  if (count % 1000 != 0) {
    output_db->Write(leveldb::WriteOptions(), batch.get());
  }
  LOG(INFO) << "Processed a total of " << count << " files.";
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
|
||||
// Entry point: parses flags via GlobalInit, then converts the encoded-image
// leveldb named by FLAGS_input_db_name into the raw db FLAGS_output_db_name.
int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);
  caffe2::ConvertToRawDataset(
      caffe2::FLAGS_input_db_name, caffe2::FLAGS_output_db_name);
  return 0;
}
|
223
binaries/core_overhead_benchmark.cc
Normal file
223
binaries/core_overhead_benchmark.cc
Normal file
@ -0,0 +1,223 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "benchmark/benchmark.h"
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
|
||||
// Skips the enclosing benchmark (reporting an explanatory error) when the
// machine has no CUDA devices. Relies on a `state` benchmark::State in scope.
#define CAFFE2_SKIP_IF_NO_GPU \
  if (!caffe2::NumCudaDevices()) { \
    state.SkipWithError("No CUDA available, skipping benchmark."); \
    return; \
  }

using namespace caffe2;
|
||||
|
||||
// Measures the cost of constructing and destroying a CUDAContext.
static void BM_CUDAContextCreation(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  // Construct one context up front so one-time CUDA initialization work is
  // not attributed to the timed loop; volatile keeps it from being elided.
  volatile CUDAContext context_so_we_do_initialization_work;
  while (state.KeepRunning()) {
    volatile CUDAContext context;
  }
}
BENCHMARK(BM_CUDAContextCreation);

// Measures the cost of reading the stream handle off an existing context.
static void BM_CUDAContextStreamAccess(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  CUDAContext context;
  while (state.KeepRunning()) {
    volatile cudaStream_t stream = context.cuda_stream();
  }
}
BENCHMARK(BM_CUDAContextStreamAccess);

// Measures the raw cost of a cudaGetDevice call.
static void BM_cudaGetDevice(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  int id;
  while (state.KeepRunning()) {
    CUDA_ENFORCE(cudaGetDevice(&id));
  }
}
BENCHMARK(BM_cudaGetDevice);

// Measures cudaSetDevice while round-robining across all visible devices.
static void BM_cudaSetDevice(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  int total = NumCudaDevices();
  int i = 0;
  while (state.KeepRunning()) {
    CUDA_ENFORCE(cudaSetDevice((i++) % total));
  }
}
BENCHMARK(BM_cudaSetDevice);

// Measures a set-then-get device round trip across all devices.
static void BM_cudaSetAndGetDevice(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  int total = NumCudaDevices();
  int i = 0;
  int id;
  while (state.KeepRunning()) {
    CUDA_ENFORCE(cudaSetDevice((i++) % total));
    CUDA_ENFORCE(cudaGetDevice(&id));
  }
}
BENCHMARK(BM_cudaSetAndGetDevice);

// Measures cudaSetDevice when the target device never changes (the
// presumably-cached fast path in the driver).
static void BM_cudaSetSameDevice(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  while (state.KeepRunning()) {
    CUDA_ENFORCE(cudaSetDevice(0));
  }
}
BENCHMARK(BM_cudaSetSameDevice);
|
||||
|
||||
// Measures a full stream lifecycle: create, synchronize, destroy.
static void BM_cudaStreamCreateSyncDelete(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  cudaStream_t stream;
  while (state.KeepRunning()) {
    CUDA_ENFORCE(cudaStreamCreate(&stream));
    CUDA_ENFORCE(cudaStreamSynchronize(stream));
    CUDA_ENFORCE(cudaStreamDestroy(stream));
  }
}
BENCHMARK(BM_cudaStreamCreateSyncDelete);

// Measures synchronizing an idle, pre-created stream.
static void BM_cudaStreamSynchronize(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  cudaStream_t stream;
  CUDA_ENFORCE(cudaStreamCreate(&stream));
  while (state.KeepRunning()) {
    CUDA_ENFORCE(cudaStreamSynchronize(stream));
  }
}
BENCHMARK(BM_cudaStreamSynchronize);

// Measures recording an event (created without timing) onto a stream.
static void BM_cudaEventRecord(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  cudaStream_t stream;
  cudaEvent_t event;
  CUDA_ENFORCE(cudaStreamCreate(&stream));
  CUDA_ENFORCE(cudaEventCreateWithFlags(
      &event, cudaEventDefault | cudaEventDisableTiming));
  while (state.KeepRunning()) {
    CUDA_ENFORCE(cudaEventRecord(event, stream));
  }
}
BENCHMARK(BM_cudaEventRecord);

// Measures waiting on an already-recorded event plus a stream synchronize —
// the pattern used for cross-stream ordering.
static void BM_cudaStreamWaitEventThenStreamSynchronize(
    benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  cudaStream_t stream;
  cudaEvent_t event;
  CUDA_ENFORCE(cudaStreamCreate(&stream));
  CUDA_ENFORCE(cudaEventCreateWithFlags(
      &event, cudaEventDefault | cudaEventDisableTiming));
  CUDA_ENFORCE(cudaEventRecord(event, stream));
  // One untimed warmup pass of the wait+sync pair.
  CUDA_ENFORCE(cudaStreamWaitEvent(stream, event, 0));
  CUDA_ENFORCE(cudaStreamSynchronize(stream));
  while (state.KeepRunning()) {
    CUDA_ENFORCE(cudaStreamWaitEvent(stream, event, 0));
    CUDA_ENFORCE(cudaStreamSynchronize(stream));
  }
}
BENCHMARK(BM_cudaStreamWaitEventThenStreamSynchronize);
|
||||
|
||||
// Measures looking up which GPU owns a device pointer.
static void BM_CudaPointerAffinity(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  TensorCUDA tensor(vector<TIndex>{1, 2, 3, 4});
  float* ptr = tensor.mutable_data<float>();
  while (state.KeepRunning()) {
    volatile int id = GetGPUIDForPointer(ptr);
  }
}
BENCHMARK(BM_CudaPointerAffinity);
|
||||
|
||||
namespace {
// A no-op operator, registered for CPU and CUDA, used solely to benchmark
// the framework's operator-creation overhead (no compute of its own).
template <class Context>
class DummyEmptyOp : public Operator<Context> {
 public:
  DummyEmptyOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws) {}

  bool RunOnDevice() final { return true; }
};

REGISTER_CPU_OPERATOR(DummyEmpty, DummyEmptyOp<CPUContext>);
REGISTER_CUDA_OPERATOR(DummyEmpty, DummyEmptyOp<CUDAContext>);
OPERATOR_SCHEMA(DummyEmpty);
} // namespace
|
||||
|
||||
// Measures CreateOperator overhead for the no-op operator on CPU.
static void BM_OperatorCreationCPU(benchmark::State& state) {
  std::unique_ptr<OperatorBase> op;
  OperatorDef def;
  Workspace ws;
  def.set_type("DummyEmpty");
  def.mutable_device_option()->set_device_type(CPU);
  while (state.KeepRunning()) {
    op = CreateOperator(def, &ws);
  }
}
BENCHMARK(BM_OperatorCreationCPU);

// Measures CreateOperator overhead for the no-op operator on CUDA.
static void BM_OperatorCreationCUDA(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  std::unique_ptr<OperatorBase> op;
  OperatorDef def;
  Workspace ws;
  def.set_type("DummyEmpty");
  def.mutable_device_option()->set_device_type(CUDA);
  while (state.KeepRunning()) {
    op = CreateOperator(def, &ws);
  }
}
BENCHMARK(BM_OperatorCreationCUDA);
|
||||
|
||||
// Measures a raw 1-byte allocate/deallocate round trip through the
// registered CPU allocator.
static void BM_RawAllocDeallocCPU(benchmark::State& state) {
  while (state.KeepRunning()) {
    // Allocating only 1 byte in order to measure the overhead.
    auto ptr_and_deleter = GetCPUAllocator()->New(1);
    // Deallocate via the paired deleter returned by New().
    ptr_and_deleter.second(ptr_and_deleter.first);
  }
}
BENCHMARK(BM_RawAllocDeallocCPU);

// Measures a CPU tensor's lazy data allocation followed by FreeMemory.
static void BM_TensorAllocDeallocCPU(benchmark::State& state) {
  Tensor<CPUContext> tensor;
  // small allocation
  tensor.Resize(32, 32);
  while (state.KeepRunning()) {
    CHECK(tensor.mutable_data<float>());
    tensor.FreeMemory();
  }
}
BENCHMARK(BM_TensorAllocDeallocCPU);

// Same as above but on a CUDA tensor (exercises the GPU allocator).
static void BM_TensorAllocDeallocCUDA(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  Tensor<CUDAContext> tensor;
  // small allocation
  tensor.Resize(32, 32);
  while (state.KeepRunning()) {
    CHECK(tensor.mutable_data<float>());
    tensor.FreeMemory();
  }
}
BENCHMARK(BM_TensorAllocDeallocCUDA);

BENCHMARK_MAIN()
|
98
binaries/db_throughput.cc
Normal file
98
binaries/db_throughput.cc
Normal file
@ -0,0 +1,98 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <cstdio>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/timer.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
CAFFE2_DEFINE_string(input_db, "", "The input db.");
|
||||
CAFFE2_DEFINE_string(input_db_type, "", "The input db type.");
|
||||
CAFFE2_DEFINE_int(report_interval, 1000, "The report interval.");
|
||||
CAFFE2_DEFINE_int(repeat, 10, "The number to repeat the throughput test.");
|
||||
CAFFE2_DEFINE_bool(use_reader, false, "If true, use the reader interface.");
|
||||
CAFFE2_DEFINE_int(num_read_threads, 1,
|
||||
"The number of concurrent reading threads.");
|
||||
|
||||
using caffe2::db::Cursor;
|
||||
using caffe2::db::DB;
|
||||
using caffe2::db::DBReader;
|
||||
using caffe2::string;
|
||||
|
||||
// Measures raw read throughput through the DB/Cursor interface: repeatedly
// copies key and value, wrapping back to the first record when the cursor
// runs off the end. Prints items/sec every FLAGS_report_interval reads,
// FLAGS_repeat times.
void TestThroughputWithDB() {
  std::unique_ptr<DB> in_db(caffe2::db::CreateDB(
      caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db, caffe2::db::READ));
  std::unique_ptr<Cursor> cursor(in_db->NewCursor());
  for (int iter_id = 0; iter_id < caffe2::FLAGS_repeat; ++iter_id) {
    caffe2::Timer timer;
    for (int i = 0; i < caffe2::FLAGS_report_interval; ++i) {
      // The copies into local strings are intentional: they are part of
      // what a real consumer of the cursor would pay for.
      string key = cursor->key();
      string value = cursor->value();
      //VLOG(1) << "Key " << key;
      cursor->Next();
      if (!cursor->Valid()) {
        cursor->SeekToFirst();
      }
    }
    double elapsed_seconds = timer.Seconds();
    printf("Iteration %03d, took %4.5f seconds, throughput %f items/sec.\n",
           iter_id, elapsed_seconds,
           caffe2::FLAGS_report_interval / elapsed_seconds);
  }
}
|
||||
|
||||
// Per-thread worker for the DBReader throughput test: reads
// FLAGS_report_interval records from the shared reader and prints this
// thread's items/sec, repeated FLAGS_repeat times.
void TestThroughputWithReaderWorker(const DBReader* reader, int thread_id) {
  string key, value;
  for (int iter_id = 0; iter_id < caffe2::FLAGS_repeat; ++iter_id) {
    caffe2::Timer timer;
    for (int i = 0; i < caffe2::FLAGS_report_interval; ++i) {
      reader->Read(&key, &value);
    }
    double elapsed_seconds = timer.Seconds();
    printf("Thread %03d iteration %03d, took %4.5f seconds, "
           "throughput %f items/sec.\n",
           thread_id, iter_id, elapsed_seconds,
           caffe2::FLAGS_report_interval / elapsed_seconds);
  }
}
|
||||
|
||||
void TestThroughputWithReader() {
|
||||
caffe2::db::DBReader reader(
|
||||
caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db);
|
||||
std::vector<std::unique_ptr<std::thread>> reading_threads(
|
||||
caffe2::FLAGS_num_read_threads);
|
||||
for (int i = 0; i < reading_threads.size(); ++i) {
|
||||
reading_threads[i].reset(new std::thread(
|
||||
TestThroughputWithReaderWorker, &reader, i));
|
||||
}
|
||||
for (int i = 0; i < reading_threads.size(); ++i) {
|
||||
reading_threads[i]->join();
|
||||
}
|
||||
}
|
||||
|
||||
// Entry point: runs the multi-threaded DBReader benchmark when
// FLAGS_use_reader is set, otherwise the single-cursor DB benchmark.
int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);
  if (caffe2::FLAGS_use_reader) {
    TestThroughputWithReader();
  } else {
    TestThroughputWithDB();
  }
  return 0;
}
|
57
binaries/inspect_gpus.cc
Normal file
57
binaries/inspect_gpus.cc
Normal file
@ -0,0 +1,57 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
|
||||
#include "caffe2/core/common_gpu.h"
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
using std::vector;
|
||||
|
||||
CAFFE2_DECLARE_int(caffe2_log_level);
|
||||
|
||||
// Prints the properties of every CUDA device on the machine, then logs the
// GPU peer-access matrix ('+' = device i can access device j's memory).
int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);
  caffe2::SetUsageMessage(
      "Inspects the GPUs on the current machine and prints out their details "
      "provided by cuda.");

  int gpu_count;
  CUDA_ENFORCE(cudaGetDeviceCount(&gpu_count));
  for (int i = 0; i < gpu_count; ++i) {
    LOG(INFO) << "Querying device ID = " << i;
    caffe2::DeviceQuery(i);
  }

  vector<vector<bool> > access_pattern;
  CAFFE_ENFORCE(caffe2::GetCudaPeerAccessPattern(&access_pattern));

  std::stringstream sstream;
  // Render the peer-access topology as a row-per-device matrix.
  for (int i = 0; i < gpu_count; ++i) {
    for (int j = 0; j < gpu_count; ++j) {
      sstream << (access_pattern[i][j] ? "+" : "-") << " ";
    }
    sstream << std::endl;
  }
  LOG(INFO) << "Access pattern: " << std::endl << sstream.str();

  return 0;
}
|
148
binaries/make_cifar_db.cc
Normal file
148
binaries/make_cifar_db.cc
Normal file
@ -0,0 +1,148 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
//
|
||||
// This script converts the CIFAR dataset to the leveldb format used
|
||||
// by caffe to perform classification.
|
||||
// Usage:
|
||||
// convert_cifar_data input_folder output_db_file
|
||||
// The CIFAR dataset could be downloaded at
|
||||
// http://www.cs.toronto.edu/~kriz/cifar.html
|
||||
|
||||
#include <array>
|
||||
#include <fstream> // NOLINT(readability/streams)
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
CAFFE2_DEFINE_string(input_folder, "", "The input folder name.");
|
||||
CAFFE2_DEFINE_string(output_train_db_name,
|
||||
"", "The output training db name.");
|
||||
CAFFE2_DEFINE_string(output_test_db_name,
|
||||
"", "The output testing db name.");
|
||||
CAFFE2_DEFINE_string(db, "leveldb", "The db type.");
|
||||
CAFFE2_DEFINE_bool(is_cifar100, false,
|
||||
"If set, convert cifar100. Otherwise do cifar10.");
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
using std::stringstream;
|
||||
|
||||
const int kCIFARSize = 32;
|
||||
const int kCIFARImageNBytes = kCIFARSize * kCIFARSize * 3;
|
||||
const int kCIFAR10BatchSize = 10000;
|
||||
const int kCIFAR10TestDataSize = 10000;
|
||||
const int kCIFAR10TrainBatches = 5;
|
||||
|
||||
const int kCIFAR100TrainDataSize = 50000;
|
||||
const int kCIFAR100TestDataSize = 10000;
|
||||
|
||||
// Reads one CIFAR record from `file`: a 1-byte label (preceded by a 1-byte
// coarse label that is skipped in CIFAR-100 mode) followed by a CHW image.
// Writes the label into *label and the image, converted to HWC order, into
// `buffer` (which must hold at least kCIFARImageNBytes bytes).
void ReadImage(std::ifstream* file, int* label, char* buffer) {
  char label_char;
  if (caffe2::FLAGS_is_cifar100) {
    // Skip the coarse label; only the fine label below is kept.
    file->read(&label_char, 1);
  }
  file->read(&label_char, 1);
  *label = label_char;
  // Yes, there are better ways to do it, like in-place swap... but I am too
  // lazy so let's just write it in a memory-wasteful way.
  std::array<char, kCIFARImageNBytes> channel_first_storage;
  file->read(channel_first_storage.data(), kCIFARImageNBytes);
  // CHW -> HWC: pixel i of channel c lands at buffer[i * 3 + c].
  for (int c = 0; c < 3; ++c) {
    for (int i = 0; i < kCIFARSize * kCIFARSize; ++i) {
      buffer[i * 3 + c] =
          channel_first_storage[c * kCIFARSize * kCIFARSize + i];
    }
  }
  return;
}
|
||||
|
||||
// Reads `num_items` CIFAR records from `filename` and writes them to `db`
// as TensorProtos (HWC byte image + INT32 label), keyed by zero-padded
// record index starting at `offset`.
void WriteToDB(const string& filename, const int num_items,
               const int& offset, db::DB* db) {
  TensorProtos protos;
  TensorProto* data = protos.add_protos();
  TensorProto* label = protos.add_protos();
  data->set_data_type(TensorProto::BYTE);
  data->add_dims(kCIFARSize);
  data->add_dims(kCIFARSize);
  data->add_dims(3);
  label->set_data_type(TensorProto::INT32);
  label->add_dims(1);
  label->add_int32_data(0);

  LOG(INFO) << "Converting file " << filename;
  std::ifstream data_file(filename.c_str(),
                          std::ios::in | std::ios::binary);
  CAFFE_ENFORCE(data_file, "Unable to open file ", filename);
  char str_buffer[kCIFARImageNBytes];
  int label_value;
  string serialized_protos;
  std::unique_ptr<db::Transaction> transaction(db->NewTransaction());
  for (int itemid = 0; itemid < num_items; ++itemid) {
    ReadImage(&data_file, &label_value, str_buffer);
    data->set_byte_data(str_buffer, kCIFARImageNBytes);
    label->set_int32_data(0, label_value);
    protos.SerializeToString(&serialized_protos);
    // str_buffer is reused as the key buffer here — safe only because the
    // image bytes were already copied into `data` above.
    snprintf(str_buffer, kCIFARImageNBytes, "%05d",
             offset + itemid);
    transaction->Put(string(str_buffer), serialized_protos);
  }
  // NOTE(review): no explicit Commit() — whether the puts are flushed depends
  // on db::Transaction's destructor behavior for FLAGS_db; verify per backend.
}
|
||||
|
||||
// Converts the CIFAR-10 or CIFAR-100 binary distribution under
// FLAGS_input_folder into train and test dbs of type FLAGS_db.
void ConvertCIFAR() {
  std::unique_ptr<db::DB> train_db(
      db::CreateDB(caffe2::FLAGS_db, caffe2::FLAGS_output_train_db_name,
                   db::NEW));
  std::unique_ptr<db::DB> test_db(
      db::CreateDB(caffe2::FLAGS_db, caffe2::FLAGS_output_test_db_name,
                   db::NEW));

  if (!caffe2::FLAGS_is_cifar100) {
    // This is cifar 10: five training batch files plus one test file,
    // offsetting keys so the training records are numbered consecutively.
    for (int fileid = 0; fileid < kCIFAR10TrainBatches; ++fileid) {
      stringstream train_file;
      train_file << caffe2::FLAGS_input_folder << "/data_batch_" << fileid + 1
                 << ".bin";
      WriteToDB(train_file.str(), kCIFAR10BatchSize,
                fileid * kCIFAR10BatchSize, train_db.get());
    }
    stringstream test_file;
    test_file << caffe2::FLAGS_input_folder << "/test_batch.bin";
    WriteToDB(test_file.str(), kCIFAR10TestDataSize, 0, test_db.get());
  } else {
    // This is cifar 100: a single train.bin and test.bin.
    stringstream train_file;
    train_file << caffe2::FLAGS_input_folder << "/train.bin";
    WriteToDB(train_file.str(), kCIFAR100TrainDataSize, 0, train_db.get());
    stringstream test_file;
    test_file << caffe2::FLAGS_input_folder << "/test.bin";
    WriteToDB(test_file.str(), kCIFAR100TestDataSize, 0, test_db.get());
  }
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
// Entry point: parses flags via GlobalInit, then runs the CIFAR conversion.
int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);
  caffe2::ConvertCIFAR();
  return 0;
}
|
280
binaries/make_image_db.cc
Normal file
280
binaries/make_image_db.cc
Normal file
@ -0,0 +1,280 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// This script converts an image dataset to a database.
|
||||
//
|
||||
// caffe2::FLAGS_input_folder is the root folder that holds all the images
|
||||
//
|
||||
// caffe2::FLAGS_list_file is the path to a file containing a list of files
|
||||
// and their labels, as follows:
|
||||
//
|
||||
// subfolder1/file1.JPEG 7
|
||||
// subfolder1/file2.JPEG 7
|
||||
// subfolder2/file1.JPEG 8
|
||||
// ...
|
||||
//
|
||||
|
||||
#include <opencv2/opencv.hpp>
|
||||
|
||||
#include <algorithm>
|
||||
#include <fstream>
|
||||
#include <queue>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
CAFFE2_DEFINE_bool(shuffle, false,
|
||||
"Randomly shuffle the order of images and their labels");
|
||||
CAFFE2_DEFINE_string(input_folder, "", "The input image file name.");
|
||||
CAFFE2_DEFINE_string(
|
||||
list_file,
|
||||
"",
|
||||
"The text file containing the list of images.");
|
||||
CAFFE2_DEFINE_string(output_db_name, "", "The output training leveldb name.");
|
||||
CAFFE2_DEFINE_string(db, "leveldb", "The db type.");
|
||||
CAFFE2_DEFINE_bool(raw, false,
|
||||
"If set, we pre-read the images and store the raw buffer.");
|
||||
CAFFE2_DEFINE_bool(color, true, "If set, load images in color.");
|
||||
CAFFE2_DEFINE_int(
|
||||
scale,
|
||||
256,
|
||||
"If caffe2::FLAGS_raw is set, scale the shorter edge to the given value.");
|
||||
CAFFE2_DEFINE_bool(warp, false, "If warp is set, warp the images to square.");
|
||||
CAFFE2_DEFINE_int(
|
||||
num_threads,
|
||||
-1,
|
||||
"Number of image parsing and conversion threads.");
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Converts (relative filename, label) pairs into serialized TensorProtos
// strings on a dedicated worker thread. Usage: queue() all entries, then
// start(); drain results in queue order with get(). The out_ queue is bounded
// at one element — run() blocks until the consumer has taken the previous
// result — so each Converter hands results over in lock-step with its reader.
// queue() is only safe to call before start(); there is no locking around in_
// at queueing time.
class Converter {
 public:
  explicit Converter() {
    data_ = protos_.add_protos();
    label_ = protos_.add_protos();
    if (caffe2::FLAGS_raw) {
      // Raw mode: store decoded pixel bytes. Dims 0/1 (H, W) are placeholders
      // here and are overwritten per image once the resized size is known.
      data_->set_data_type(TensorProto::BYTE);
      data_->add_dims(0);
      data_->add_dims(0);
      if (caffe2::FLAGS_color) {
        data_->add_dims(3);
      }
    } else {
      // Non-raw mode: store the encoded file contents as one string datum.
      data_->set_data_type(TensorProto::STRING);
      data_->add_dims(1);
      data_->add_string_data("");
    }
    label_->set_data_type(TensorProto::INT32);
    label_->add_dims(1);
    label_->add_int32_data(0);
  }

  ~Converter() {
    // Make sure the worker finished before the queues/protos are destroyed.
    if (thread_.joinable()) {
      thread_.join();
    }
  }

  // Enqueue one (relative path, label) work item. Call before start() only.
  void queue(const std::pair<std::string, int>& pair) {
    in_.push(pair);
  }

  // Launch the worker thread that drains in_ and produces into out_.
  void start() {
    thread_ = std::thread(&Converter::run, this);
  }

  // Block until the next serialized TensorProtos is available and return it.
  std::string get() {
    std::unique_lock<std::mutex> lock(mutex_);
    while (out_.empty()) {
      cv_.wait(lock);
    }

    auto value = out_.front();
    out_.pop();
    cv_.notify_one();  // wake the worker waiting for out_ to drain
    return value;
  }

  // Worker loop: for each queued entry, either read the raw file bytes
  // (!FLAGS_raw) or decode+resize the image with OpenCV (FLAGS_raw), fill the
  // label, serialize the protos, and hand the result to get().
  void run() {
    const auto& input_folder = caffe2::FLAGS_input_folder;
    std::unique_lock<std::mutex> lock(mutex_);
    std::string value;
    while (!in_.empty()) {
      auto pair = in_.front();
      in_.pop();
      lock.unlock();  // heavy I/O and image work happens without the lock

      label_->set_int32_data(0, pair.second);

      // Add raw file contents to DB if !raw
      if (!caffe2::FLAGS_raw) {
        std::ifstream image_file_stream(input_folder + pair.first);
        if (!image_file_stream) {
          // On a missing file the previous entry's bytes are kept; the item
          // is logged and skipped rather than aborting the whole conversion.
          LOG(ERROR) << "Cannot open " << input_folder << pair.first
                     << ". Skipping.";
        } else {
          data_->mutable_string_data(0)->assign(
              std::istreambuf_iterator<char>(image_file_stream),
              std::istreambuf_iterator<char>());
        }
      } else {
        // Load image
        cv::Mat img = cv::imread(
            input_folder + pair.first,
            caffe2::FLAGS_color ? CV_LOAD_IMAGE_COLOR
                                : CV_LOAD_IMAGE_GRAYSCALE);

        // Resize image: scale the shorter edge to FLAGS_scale while keeping
        // the aspect ratio, unless FLAGS_warp forces a square output.
        cv::Mat resized_img;
        int scaled_width, scaled_height;
        if (caffe2::FLAGS_warp) {
          scaled_width = caffe2::FLAGS_scale;
          scaled_height = caffe2::FLAGS_scale;
        } else if (img.rows > img.cols) {
          scaled_width = caffe2::FLAGS_scale;
          scaled_height =
              static_cast<float>(img.rows) * caffe2::FLAGS_scale / img.cols;
        } else {
          scaled_height = caffe2::FLAGS_scale;
          scaled_width =
              static_cast<float>(img.cols) * caffe2::FLAGS_scale / img.rows;
        }
        cv::resize(
            img,
            resized_img,
            cv::Size(scaled_width, scaled_height),
            0,
            0,
            cv::INTER_LINEAR);
        data_->set_dims(0, scaled_height);
        data_->set_dims(1, scaled_width);

        // Assert we don't have to deal with alignment
        DCHECK(resized_img.isContinuous());
        auto nbytes = resized_img.total() * resized_img.elemSize();
        data_->set_byte_data(resized_img.ptr(), nbytes);
      }

      protos_.SerializeToString(&value);

      // Add serialized proto to out queue or wait if it is not empty
      lock.lock();
      while (!out_.empty()) {
        cv_.wait(lock);
      }
      out_.push(value);
      cv_.notify_one();
    }
  }

 protected:
  TensorProtos protos_;  // reused proto pair: [data, label]
  TensorProto* data_;
  TensorProto* label_;
  std::queue<std::pair<std::string, int>> in_;  // pending work items
  std::queue<std::string> out_;                 // holds at most one result

  std::mutex mutex_;             // guards in_ and out_
  std::condition_variable cv_;   // signals out_ becoming empty/non-empty
  std::thread thread_;
};
|
||||
|
||||
void ConvertImageDataset(
|
||||
const string& input_folder,
|
||||
const string& list_filename,
|
||||
const string& output_db_name,
|
||||
const bool /*shuffle*/) {
|
||||
std::ifstream list_file(list_filename);
|
||||
std::vector<std::pair<std::string, int> > lines;
|
||||
std::string filename;
|
||||
int file_label;
|
||||
while (list_file >> filename >> file_label) {
|
||||
lines.push_back(std::make_pair(filename, file_label));
|
||||
}
|
||||
|
||||
if (caffe2::FLAGS_shuffle) {
|
||||
LOG(INFO) << "Shuffling data";
|
||||
std::shuffle(lines.begin(), lines.end(), std::default_random_engine(1701));
|
||||
}
|
||||
|
||||
auto num_threads = caffe2::FLAGS_num_threads;
|
||||
if (num_threads < 1) {
|
||||
num_threads = std::thread::hardware_concurrency();
|
||||
}
|
||||
|
||||
LOG(INFO) << "Processing " << lines.size() << " images...";
|
||||
LOG(INFO) << "Opening DB " << output_db_name;
|
||||
|
||||
auto db = db::CreateDB(caffe2::FLAGS_db, output_db_name, db::NEW);
|
||||
auto transaction = db->NewTransaction();
|
||||
|
||||
LOG(INFO) << "Using " << num_threads << " processing threads...";
|
||||
std::vector<Converter> converters(num_threads);
|
||||
|
||||
// Queue entries across converters
|
||||
for (auto i = 0; i < lines.size(); i++) {
|
||||
converters[i % converters.size()].queue(lines[i]);
|
||||
}
|
||||
|
||||
// Start all converters
|
||||
for (auto& converter : converters) {
|
||||
converter.start();
|
||||
}
|
||||
|
||||
constexpr auto key_max_length = 256;
|
||||
char key_cstr[key_max_length];
|
||||
string value;
|
||||
int count = 0;
|
||||
for (auto i = 0; i < lines.size(); i++) {
|
||||
// Get serialized proto for this entry
|
||||
auto value = converters[i % converters.size()].get();
|
||||
|
||||
// Synthesize key for this entry
|
||||
auto key_len = snprintf(
|
||||
key_cstr, sizeof(key_cstr), "%08d_%s", i, lines[i].first.c_str());
|
||||
DCHECK_LE(key_len, sizeof(key_cstr));
|
||||
|
||||
// Put in db
|
||||
transaction->Put(string(key_cstr), value);
|
||||
|
||||
if (++count % 1000 == 0) {
|
||||
// Commit the current writes.
|
||||
transaction->Commit();
|
||||
LOG(INFO) << "Processed " << count << " files.";
|
||||
}
|
||||
}
|
||||
|
||||
// Commit final transaction
|
||||
transaction->Commit();
|
||||
LOG(INFO) << "Processed " << count << " files.";
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
|
||||
// Entry point: initializes caffe2, then converts the image list given by the
// flags into a DB.
int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);
  caffe2::ConvertImageDataset(
      caffe2::FLAGS_input_folder, caffe2::FLAGS_list_file,
      caffe2::FLAGS_output_db_name, caffe2::FLAGS_shuffle);
  return 0;
}
|
139
binaries/make_mnist_db.cc
Normal file
139
binaries/make_mnist_db.cc
Normal file
@ -0,0 +1,139 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// This script converts the MNIST dataset to leveldb.
|
||||
// The MNIST dataset could be downloaded at
|
||||
// http://yann.lecun.com/exdb/mnist/
|
||||
|
||||
#include <fstream> // NOLINT(readability/streams)
|
||||
#include <string>
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
CAFFE2_DEFINE_string(image_file, "", "The input image file name.");
|
||||
CAFFE2_DEFINE_string(label_file, "", "The label file name.");
|
||||
CAFFE2_DEFINE_string(output_file, "", "The output db name.");
|
||||
CAFFE2_DEFINE_string(db, "leveldb", "The db type.");
|
||||
CAFFE2_DEFINE_int(data_limit, -1,
|
||||
"If set, only output this number of data points.");
|
||||
CAFFE2_DEFINE_bool(channel_first, false,
|
||||
"If set, write the data as channel-first (CHW order) as the old "
|
||||
"Caffe does.");
|
||||
|
||||
namespace caffe2 {
|
||||
// Reverses the byte order of a 32-bit value (big-endian <-> little-endian),
// used to decode the big-endian integers in the MNIST file headers.
uint32_t swap_endian(uint32_t val) {
  // First swap adjacent bytes within each 16-bit half, then swap the halves.
  const uint32_t pair_swapped =
      ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF);
  return (pair_swapped << 16) | (pair_swapped >> 16);
}
|
||||
|
||||
void convert_dataset(const char* image_filename, const char* label_filename,
|
||||
const char* db_path, const int data_limit) {
|
||||
// Open files
|
||||
std::ifstream image_file(image_filename, std::ios::in | std::ios::binary);
|
||||
std::ifstream label_file(label_filename, std::ios::in | std::ios::binary);
|
||||
CAFFE_ENFORCE(image_file, "Unable to open file ", image_filename);
|
||||
CAFFE_ENFORCE(label_file, "Unable to open file ", label_filename);
|
||||
// Read the magic and the meta data
|
||||
uint32_t magic;
|
||||
uint32_t num_items;
|
||||
uint32_t num_labels;
|
||||
uint32_t rows;
|
||||
uint32_t cols;
|
||||
|
||||
image_file.read(reinterpret_cast<char*>(&magic), 4);
|
||||
magic = swap_endian(magic);
|
||||
if (magic == 529205256) {
|
||||
LOG(FATAL) <<
|
||||
"It seems that you forgot to unzip the mnist dataset. You should "
|
||||
"first unzip them using e.g. gunzip on Linux.";
|
||||
}
|
||||
CAFFE_ENFORCE_EQ(magic, 2051, "Incorrect image file magic.");
|
||||
label_file.read(reinterpret_cast<char*>(&magic), 4);
|
||||
magic = swap_endian(magic);
|
||||
CAFFE_ENFORCE_EQ(magic, 2049, "Incorrect label file magic.");
|
||||
image_file.read(reinterpret_cast<char*>(&num_items), 4);
|
||||
num_items = swap_endian(num_items);
|
||||
label_file.read(reinterpret_cast<char*>(&num_labels), 4);
|
||||
num_labels = swap_endian(num_labels);
|
||||
CAFFE_ENFORCE_EQ(num_items, num_labels);
|
||||
image_file.read(reinterpret_cast<char*>(&rows), 4);
|
||||
rows = swap_endian(rows);
|
||||
image_file.read(reinterpret_cast<char*>(&cols), 4);
|
||||
cols = swap_endian(cols);
|
||||
|
||||
// leveldb
|
||||
std::unique_ptr<db::DB> mnist_db(db::CreateDB(caffe2::FLAGS_db, db_path, db::NEW));
|
||||
std::unique_ptr<db::Transaction> transaction(mnist_db->NewTransaction());
|
||||
// Storing to db
|
||||
char label_value;
|
||||
std::vector<char> pixels(rows * cols);
|
||||
int count = 0;
|
||||
const int kMaxKeyLength = 10;
|
||||
char key_cstr[kMaxKeyLength];
|
||||
string value;
|
||||
|
||||
TensorProtos protos;
|
||||
TensorProto* data = protos.add_protos();
|
||||
TensorProto* label = protos.add_protos();
|
||||
data->set_data_type(TensorProto::BYTE);
|
||||
if (caffe2::FLAGS_channel_first) {
|
||||
data->add_dims(1);
|
||||
data->add_dims(rows);
|
||||
data->add_dims(cols);
|
||||
} else {
|
||||
data->add_dims(rows);
|
||||
data->add_dims(cols);
|
||||
data->add_dims(1);
|
||||
}
|
||||
label->set_data_type(TensorProto::INT32);
|
||||
label->add_int32_data(0);
|
||||
|
||||
LOG(INFO) << "A total of " << num_items << " items.";
|
||||
LOG(INFO) << "Rows: " << rows << " Cols: " << cols;
|
||||
for (int item_id = 0; item_id < num_items; ++item_id) {
|
||||
image_file.read(pixels.data(), rows * cols);
|
||||
label_file.read(&label_value, 1);
|
||||
for (int i = 0; i < rows * cols; ++i) {
|
||||
data->set_byte_data(pixels.data(), rows * cols);
|
||||
}
|
||||
label->set_int32_data(0, static_cast<int>(label_value));
|
||||
snprintf(key_cstr, kMaxKeyLength, "%08d", item_id);
|
||||
protos.SerializeToString(&value);
|
||||
string keystr(key_cstr);
|
||||
|
||||
// Put in db
|
||||
transaction->Put(keystr, value);
|
||||
if (++count % 1000 == 0) {
|
||||
transaction->Commit();
|
||||
}
|
||||
if (data_limit > 0 && count == data_limit) {
|
||||
LOG(INFO) << "Reached data limit of " << data_limit << ", stop.";
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace caffe2
|
||||
|
||||
// Entry point: initializes caffe2, then converts the MNIST image/label files
// given by the flags into a DB (optionally capped at --data_limit entries).
int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);
  caffe2::convert_dataset(caffe2::FLAGS_image_file.c_str(), caffe2::FLAGS_label_file.c_str(),
      caffe2::FLAGS_output_file.c_str(), caffe2::FLAGS_data_limit);
  return 0;
}
|
57
binaries/predictor_verifier.cc
Normal file
57
binaries/predictor_verifier.cc
Normal file
@ -0,0 +1,57 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "caffe2/core/flags.h"
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/predictor.h"
|
||||
#include "caffe2/utils/proto_utils.h"
|
||||
|
||||
CAFFE2_DEFINE_string(init_net, "", "The given path to the init protobuffer.");
|
||||
CAFFE2_DEFINE_string(
|
||||
predict_net,
|
||||
"",
|
||||
"The given path to the predict protobuffer.");
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
void run() {
|
||||
if (FLAGS_init_net.empty()) {
|
||||
LOG(FATAL) << "No init net specified. Use --init_net=/path/to/net.";
|
||||
}
|
||||
if (FLAGS_predict_net.empty()) {
|
||||
LOG(FATAL) << "No predict net specified. Use --predict_net=/path/to/net.";
|
||||
}
|
||||
caffe2::NetDef init_net, predict_net;
|
||||
CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_init_net, &init_net));
|
||||
CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_predict_net, &predict_net));
|
||||
// Can be large due to constant fills
|
||||
VLOG(1) << "Init net: " << ProtoDebugString(init_net);
|
||||
LOG(INFO) << "Predict net: " << ProtoDebugString(predict_net);
|
||||
auto predictor = caffe2::make_unique<Predictor>(init_net, predict_net);
|
||||
LOG(INFO) << "Checking that a null forward-pass works";
|
||||
Predictor::TensorVector inputVec, outputVec;
|
||||
predictor->run(inputVec, &outputVec);
|
||||
CAFFE_ENFORCE_GT(outputVec.size(), 0);
|
||||
}
|
||||
}
|
||||
|
||||
// Entry point: initializes caffe2, runs the predictor verification, and
// shuts protobuf down cleanly so leak checkers stay quiet.
int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);
  caffe2::run();
  // This is to allow us to use memory leak checks.
  caffe2::ShutdownProtobufLibrary();
  return 0;
}
|
42
binaries/print_core_object_sizes.cc
Normal file
42
binaries/print_core_object_sizes.cc
Normal file
@ -0,0 +1,42 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
|
||||
#define PRINT_SIZE(cls) \
|
||||
std::cout << "Size of " #cls ": " << sizeof(cls) << " bytes." \
|
||||
<< std::endl;
|
||||
|
||||
// Prints the compiled sizeof() of the core caffe2 object types. Useful for
// tracking per-object memory overhead across code changes.
int main(int /* unused */, char** /* unused */) {
  PRINT_SIZE(caffe2::Blob);
  PRINT_SIZE(caffe2::Tensor<caffe2::CPUContext>);
  PRINT_SIZE(caffe2::Tensor<caffe2::CUDAContext>);
  PRINT_SIZE(caffe2::CPUContext);
  PRINT_SIZE(caffe2::CUDAContext);
  PRINT_SIZE(caffe2::OperatorBase);
  PRINT_SIZE(caffe2::OperatorDef);
  PRINT_SIZE(caffe2::Operator<caffe2::CPUContext>);
  PRINT_SIZE(caffe2::Operator<caffe2::CUDAContext>);
  PRINT_SIZE(caffe2::TypeMeta);
  PRINT_SIZE(caffe2::Workspace);
  return 0;
}
|
73
binaries/print_registered_core_operators.cc
Normal file
73
binaries/print_registered_core_operators.cc
Normal file
@ -0,0 +1,73 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/operator_schema.h"
|
||||
|
||||
CAFFE2_DEFINE_string(schema, "",
|
||||
"Print doc and schema of a particular operator");
|
||||
|
||||
static bool HasSchema(const std::string& str) {
|
||||
return caffe2::OpSchemaRegistry::Schema(str);
|
||||
}
|
||||
|
||||
static bool HasDoc(const std::string& str) {
|
||||
const auto* schema = caffe2::OpSchemaRegistry::Schema(str);
|
||||
return (schema != nullptr) && (schema->doc() != nullptr);
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
caffe2::GlobalInit(&argc, &argv);
|
||||
|
||||
if (!caffe2::FLAGS_schema.empty()) {
|
||||
const auto* schema = caffe2::OpSchemaRegistry::Schema(
|
||||
caffe2::FLAGS_schema);
|
||||
if (!schema) {
|
||||
std::cerr << "Operator " << caffe2::FLAGS_schema
|
||||
<< " doesn't have a schema" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
std::cout << "Operator " << caffe2::FLAGS_schema << ": " << std::endl
|
||||
<< *schema;
|
||||
return 0;
|
||||
}
|
||||
|
||||
for (const auto& pair : *caffe2::gDeviceTypeRegistry()) {
|
||||
std::cout << "Device type " << pair.first
|
||||
#ifndef CAFFE2_USE_LITE_PROTO
|
||||
<< " (" << caffe2::DeviceType_Name(
|
||||
static_cast<caffe2::DeviceType>(pair.first))
|
||||
<< ")"
|
||||
#endif
|
||||
<< std::endl;
|
||||
for (const auto& key : pair.second->Keys()) {
|
||||
std::cout << "\t(schema: " << HasSchema(key) << ", doc: " << HasDoc(key)
|
||||
<< ")\t" << key << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Operators that have gradients registered:" << std::endl;
|
||||
for (const auto& key : caffe2::GradientRegistry()->Keys()) {
|
||||
std::cout << "\t(schema: " << HasSchema(key) << ", doc: "
|
||||
<< HasDoc(key) << ")\t"
|
||||
<< key << std::endl;
|
||||
}
|
||||
return 0;
|
||||
}
|
40
binaries/run_plan.cc
Normal file
40
binaries/run_plan.cc
Normal file
@ -0,0 +1,40 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/utils/proto_utils.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
CAFFE2_DEFINE_string(plan, "", "The given path to the plan protobuffer.");
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
caffe2::GlobalInit(&argc, &argv);
|
||||
if (caffe2::FLAGS_plan.size() == 0) {
|
||||
LOG(ERROR) << "No plan specified. Use --plan=/path/to/plan.";
|
||||
return 0;
|
||||
}
|
||||
LOG(INFO) << "Loading plan: " << caffe2::FLAGS_plan;
|
||||
caffe2::PlanDef plan_def;
|
||||
CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_plan, &plan_def));
|
||||
std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
|
||||
workspace->RunPlan(plan_def);
|
||||
|
||||
// This is to allow us to use memory leak checks.
|
||||
caffe2::ShutdownProtobufLibrary();
|
||||
return 0;
|
||||
}
|
48
binaries/run_plan_mpi.cc
Normal file
48
binaries/run_plan_mpi.cc
Normal file
@ -0,0 +1,48 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <mpi.h>
|
||||
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/utils/proto_utils.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
CAFFE2_DEFINE_string(plan, "", "The given path to the plan protobuffer.");
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
caffe2::SetUsageMessage("Runs a caffe2 plan that has MPI operators in it.");
|
||||
int mpi_ret;
|
||||
MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mpi_ret);
|
||||
if (mpi_ret != MPI_THREAD_MULTIPLE &&
|
||||
mpi_ret != MPI_THREAD_SERIALIZED) {
|
||||
std::cerr << "Caffe2 MPI requires the underlying MPI to support the "
|
||||
"MPI_THREAD_SERIALIZED or MPI_THREAD_MULTIPLE mode.\n";
|
||||
return 1;
|
||||
}
|
||||
caffe2::GlobalInit(&argc, &argv);
|
||||
LOG(INFO) << "Loading plan: " << caffe2::FLAGS_plan;
|
||||
caffe2::PlanDef plan_def;
|
||||
CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_plan, &plan_def));
|
||||
std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
|
||||
workspace->RunPlan(plan_def);
|
||||
|
||||
// This is to allow us to use memory leak checks.
|
||||
caffe2::ShutdownProtobufLibrary();
|
||||
MPI_Finalize();
|
||||
return 0;
|
||||
}
|
193
binaries/speed_benchmark.cc
Normal file
193
binaries/speed_benchmark.cc
Normal file
@ -0,0 +1,193 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/utils/proto_utils.h"
|
||||
#include "caffe2/utils/string_utils.h"
|
||||
|
||||
CAFFE2_DEFINE_string(net, "", "The given net to benchmark.");
|
||||
CAFFE2_DEFINE_string(
|
||||
init_net,
|
||||
"",
|
||||
"The given net to initialize any parameters.");
|
||||
CAFFE2_DEFINE_string(
|
||||
input,
|
||||
"",
|
||||
"Input that is needed for running the network. If "
|
||||
"multiple input needed, use comma separated string.");
|
||||
CAFFE2_DEFINE_string(
|
||||
input_file,
|
||||
"",
|
||||
"Input file that contain the serialized protobuf for "
|
||||
"the input blobs. If multiple input needed, use comma "
|
||||
"separated string. Must have the same number of items "
|
||||
"as input does.");
|
||||
CAFFE2_DEFINE_string(
|
||||
input_dims,
|
||||
"",
|
||||
"Alternate to input_files, if all inputs are simple "
|
||||
"float TensorCPUs, specify the dimension using comma "
|
||||
"separated numbers. If multiple input needed, use "
|
||||
"semicolon to separate the dimension of different "
|
||||
"tensors.");
|
||||
CAFFE2_DEFINE_string(
|
||||
input_type,
|
||||
"", "Input type (uint8_t/float)");
|
||||
CAFFE2_DEFINE_string(
|
||||
output,
|
||||
"",
|
||||
"Output that should be dumped after the execution "
|
||||
"finishes. If multiple outputs are needed, use comma "
|
||||
"separated string. If you want to dump everything, pass "
|
||||
"'*' as the output value.");
|
||||
CAFFE2_DEFINE_string(
|
||||
output_folder,
|
||||
"",
|
||||
"The folder that the output should be written to. This "
|
||||
"folder must already exist in the file system.");
|
||||
CAFFE2_DEFINE_int(warmup, 0, "The number of iterations to warm up.");
|
||||
CAFFE2_DEFINE_int(iter, 10, "The number of iterations to run.");
|
||||
CAFFE2_DEFINE_bool(
|
||||
run_individual,
|
||||
false,
|
||||
"Whether to benchmark individual operators.");
|
||||
|
||||
CAFFE2_DEFINE_bool(force_engine, false, "Force engine field for all operators");
|
||||
CAFFE2_DEFINE_string(engine, "", "Forced engine field value");
|
||||
CAFFE2_DEFINE_bool(force_algo, false, "Force algo arg for all operators");
|
||||
CAFFE2_DEFINE_string(algo, "", "Forced algo arg value");
|
||||
|
||||
using std::string;
|
||||
using std::unique_ptr;
|
||||
using std::vector;
|
||||
|
||||
// Benchmarks a caffe2 net: runs the init net, sets up the requested input
// blobs (either deserialized from files or as freshly-allocated tensors of
// the given dims/types), times FLAGS_iter iterations after FLAGS_warmup
// warmup runs, and optionally dumps the requested output blobs to
// FLAGS_output_folder.
int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);
  unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());

  // Run initialization network.
  caffe2::NetDef net_def;
  CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_init_net, &net_def));
  CAFFE_ENFORCE(workspace->RunNetOnce(net_def));

  // Load input.
  if (caffe2::FLAGS_input.size()) {
    vector<string> input_names = caffe2::split(',', caffe2::FLAGS_input);
    if (caffe2::FLAGS_input_file.size()) {
      // One serialized BlobProto file per input name, matched by position.
      vector<string> input_files = caffe2::split(',', caffe2::FLAGS_input_file);
      CAFFE_ENFORCE_EQ(
          input_names.size(),
          input_files.size(),
          "Input name and file should have the same number.");
      for (int i = 0; i < input_names.size(); ++i) {
        caffe2::BlobProto blob_proto;
        CAFFE_ENFORCE(caffe2::ReadProtoFromFile(input_files[i], &blob_proto));
        workspace->CreateBlob(input_names[i])->Deserialize(blob_proto);
      }
    } else if (caffe2::FLAGS_input_dims.size() || caffe2::FLAGS_input_type.size()) {
      // Synthesize tensors of the requested dims/type; contents are left
      // uninitialized, which is fine for timing purposes.
      CAFFE_ENFORCE_NE(0, caffe2::FLAGS_input_dims.size(),
          "Input dims must be specified when input tensors are used.");
      CAFFE_ENFORCE_NE(0, caffe2::FLAGS_input_type.size(),
          "Input type must be specified when input tensors are used.");

      vector<string> input_dims_list =
          caffe2::split(';', caffe2::FLAGS_input_dims);
      CAFFE_ENFORCE_EQ(
          input_names.size(),
          input_dims_list.size(),
          "Input name and dims should have the same number of items.");
      vector<string> input_type_list =
          caffe2::split(';', caffe2::FLAGS_input_type);
      CAFFE_ENFORCE_EQ(
          input_names.size(),
          input_type_list.size(),
          "Input name and type should have the same number of items.");
      for (size_t i = 0; i < input_names.size(); ++i) {
        // Dims for one tensor are comma-separated within the semicolon list.
        vector<string> input_dims_str = caffe2::split(',', input_dims_list[i]);
        vector<int> input_dims;
        for (const string& s : input_dims_str) {
          input_dims.push_back(caffe2::stoi(s));
        }
        caffe2::Blob* blob = workspace->GetBlob(input_names[i]);
        if (blob == nullptr) {
          blob = workspace->CreateBlob(input_names[i]);
        }
        caffe2::TensorCPU* tensor = blob->GetMutable<caffe2::TensorCPU>();
        CHECK_NOTNULL(tensor);
        tensor->Resize(input_dims);
        // mutable_data<T>() allocates the buffer with the requested type.
        if (input_type_list[i] == "uint8_t") {
          tensor->mutable_data<uint8_t>();
        } else if (input_type_list[i] == "float") {
          tensor->mutable_data<float>();
        } else {
          CAFFE_THROW("Unsupported input type: ", input_type_list[i]);
        }
      }
    } else {
      CAFFE_THROW(
          "You requested input tensors, but neither input_file nor "
          "input_dims is set.");
    }
  }

  // Run main network.
  CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_net, &net_def));
  // force changing engine and algo
  if (caffe2::FLAGS_force_engine) {
    LOG(INFO) << "force engine be: " << caffe2::FLAGS_engine;
    for (const auto& op : net_def.op()) {
      // const_cast: the range-for only hands out const refs to the ops.
      const_cast<caffe2::OperatorDef*>(&op)->set_engine(caffe2::FLAGS_engine);
    }
  }
  if (caffe2::FLAGS_force_algo) {
    LOG(INFO) << "force algo be: " << caffe2::FLAGS_algo;
    for (const auto& op : net_def.op()) {
      caffe2::GetMutableArgument(
          "algo", true, const_cast<caffe2::OperatorDef*>(&op))
          ->set_s(caffe2::FLAGS_algo);
    }
  }
  caffe2::NetBase* net = workspace->CreateNet(net_def);
  CHECK_NOTNULL(net);
  net->TEST_Benchmark(
      caffe2::FLAGS_warmup, caffe2::FLAGS_iter, caffe2::FLAGS_run_individual);

  string output_prefix = caffe2::FLAGS_output_folder.size()
      ? caffe2::FLAGS_output_folder + "/"
      : "";
  if (caffe2::FLAGS_output.size()) {
    vector<string> output_names = caffe2::split(',', caffe2::FLAGS_output);
    if (caffe2::FLAGS_output == "*") {
      // '*' means dump every blob in the workspace.
      output_names = workspace->Blobs();
    }
    for (const string& name : output_names) {
      CAFFE_ENFORCE(
          workspace->HasBlob(name),
          "You requested a non-existing blob: ",
          name);
      string serialized = workspace->GetBlob(name)->Serialize(name);
      string output_filename = output_prefix + name;
      caffe2::WriteStringToFile(serialized, output_filename.c_str());
    }
  }

  return 0;
}
|
77
binaries/split_db.cc
Normal file
77
binaries/split_db.cc
Normal file
@ -0,0 +1,77 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
CAFFE2_DEFINE_string(input_db, "", "The input db.");
|
||||
CAFFE2_DEFINE_int(splits, 0, "The number of splits.");
|
||||
CAFFE2_DEFINE_string(db_type, "", "The db type.");
|
||||
CAFFE2_DEFINE_int(batch_size, 1000, "The write batch size.");
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
static int Split(int argc, char** argv) {
|
||||
GlobalInit(&argc, &argv);
|
||||
|
||||
CAFFE_ENFORCE(FLAGS_input_db.size(), "Must specify --input_db=/path/to/db.");
|
||||
CAFFE_ENFORCE(FLAGS_splits > 0, "Must specify a nonnegative split number.");
|
||||
CAFFE_ENFORCE(FLAGS_db_type.size(), "Must specify --db_type=[a db type].");
|
||||
|
||||
unique_ptr<db::DB> in_db(
|
||||
db::CreateDB(FLAGS_db_type, FLAGS_input_db, db::READ));
|
||||
CAFFE_ENFORCE(in_db != nullptr, "Cannot open input db: ", FLAGS_input_db);
|
||||
unique_ptr<db::Cursor> cursor(in_db->NewCursor());
|
||||
// This usually won't happen, but FWIW.
|
||||
CAFFE_ENFORCE(
|
||||
cursor != nullptr, "Cannot obtain cursor for input db: ", FLAGS_input_db);
|
||||
|
||||
vector<unique_ptr<db::DB>> out_dbs;
|
||||
vector<unique_ptr<db::Transaction>> transactions;
|
||||
for (int i = 0; i < FLAGS_splits; ++i) {
|
||||
out_dbs.push_back(unique_ptr<db::DB>(db::CreateDB(
|
||||
FLAGS_db_type, FLAGS_input_db + "_split_" + to_string(i), db::NEW)));
|
||||
CAFFE_ENFORCE(out_dbs.back().get(), "Cannot create output db #", i);
|
||||
transactions.push_back(
|
||||
unique_ptr<db::Transaction>(out_dbs[i]->NewTransaction()));
|
||||
CAFFE_ENFORCE(
|
||||
transactions.back().get(), "Cannot get transaction for output db #", i);
|
||||
}
|
||||
|
||||
int count = 0;
|
||||
for (; cursor->Valid(); cursor->Next()) {
|
||||
transactions[count % FLAGS_splits]->Put(cursor->key(), cursor->value());
|
||||
if (++count % FLAGS_batch_size == 0) {
|
||||
for (int i = 0; i < FLAGS_splits; ++i) {
|
||||
transactions[i]->Commit();
|
||||
}
|
||||
LOG(INFO) << "Split " << count << " items so far.";
|
||||
}
|
||||
}
|
||||
LOG(INFO) << "A total of " << count << " items processed.";
|
||||
return 0;
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
return caffe2::Split(argc, argv);
|
||||
}
|
89
binaries/tutorial_blob.cc
Normal file
89
binaries/tutorial_blob.cc
Normal file
@ -0,0 +1,89 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "caffe2/core/blob.h"
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/tensor.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
// We will be lazy and just use the whole namespace.
|
||||
using namespace caffe2;
|
||||
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
caffe2::GlobalInit(&argc, &argv);
|
||||
caffe2::ShowLogInfoToStderr();
|
||||
|
||||
LOG(INFO) <<
|
||||
"This script corresponds to the Blob part of the Caffe2 C++ "
|
||||
"tutorial.";
|
||||
|
||||
LOG(INFO) << "Let's create a blob myblob.";
|
||||
|
||||
Blob myblob;
|
||||
|
||||
LOG(INFO) << "Let's set it to int and set the value to 10.";
|
||||
|
||||
int* myint = myblob.GetMutable<int>();
|
||||
*myint = 10;
|
||||
|
||||
LOG(INFO)
|
||||
<< "Is the blob type int? "
|
||||
<< myblob.IsType<int>();
|
||||
|
||||
LOG(INFO)
|
||||
<< "Is the blob type float? "
|
||||
<< myblob.IsType<float>();
|
||||
|
||||
const int& myint_const = myblob.Get<int>();
|
||||
LOG(INFO)
|
||||
<< "The value of the int number stored in the blob is: "
|
||||
<< myint_const;
|
||||
|
||||
LOG(INFO)
|
||||
<< "Let's try to get a float pointer. This will trigger an exception.";
|
||||
|
||||
try {
|
||||
const float& myfloat = myblob.Get<float>();
|
||||
LOG(FATAL) << "This line should never happen.";
|
||||
} catch (std::exception& e) {
|
||||
LOG(INFO)
|
||||
<< "As expected, we got an exception. Its content says: "
|
||||
<< e.what();
|
||||
}
|
||||
|
||||
LOG(INFO) <<
|
||||
"However, we can change the content type (and destroy the old "
|
||||
"content) by calling GetMutable. Let's change it to double.";
|
||||
|
||||
double* mydouble = myblob.GetMutable<double>();
|
||||
*mydouble = 3.14;
|
||||
|
||||
LOG(INFO) << "The new content is: " << myblob.Get<double>();
|
||||
|
||||
LOG(INFO) <<
|
||||
"If we have a pre-created object, we can use Reset() to transfer the "
|
||||
"object to a blob.";
|
||||
|
||||
std::string* pvec = new std::string();
|
||||
myblob.Reset(pvec); // no need to release pvec, myblob takes ownership.
|
||||
|
||||
LOG(INFO) << "Is the blob now of type string? "
|
||||
<< myblob.IsType<std::string>();
|
||||
|
||||
LOG(INFO) << "This concludes the blob tutorial.";
|
||||
return 0;
|
||||
}
|
66
binaries/zmq_feeder.cc
Normal file
66
binaries/zmq_feeder.cc
Normal file
@ -0,0 +1,66 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// This binary provides an easy way to open a zeromq server and feeds data to
|
||||
// clients connect to it. It uses the Caffe2 db as the backend, thus allowing
|
||||
// one to convert any db-compliant storage to a zeromq service.
|
||||
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/utils/zmq_helper.h"
|
||||
|
||||
CAFFE2_DEFINE_string(server, "tcp://*:5555", "The server address.");
|
||||
CAFFE2_DEFINE_string(input_db, "", "The input db.");
|
||||
CAFFE2_DEFINE_string(input_db_type, "", "The input db type.");
|
||||
|
||||
using caffe2::db::DB;
|
||||
using caffe2::db::Cursor;
|
||||
using caffe2::string;
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
caffe2::GlobalInit(&argc, &argv);
|
||||
|
||||
LOG(INFO) << "Opening DB...";
|
||||
auto in_db = caffe2::db::CreateDB(
|
||||
caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db, caffe2::db::READ);
|
||||
CAFFE_ENFORCE(
|
||||
in_db,
|
||||
"Cannot load input db " + caffe2::FLAGS_input_db + " of expected type " +
|
||||
caffe2::FLAGS_input_db_type);
|
||||
auto cursor = in_db->NewCursor();
|
||||
LOG(INFO) << "DB opened.";
|
||||
|
||||
LOG(INFO) << "Starting ZeroMQ server...";
|
||||
|
||||
// Socket to talk to clients
|
||||
caffe2::ZmqSocket sender(ZMQ_PUSH);
|
||||
sender.Bind(caffe2::FLAGS_server);
|
||||
LOG(INFO) << "Server created at " << caffe2::FLAGS_server;
|
||||
|
||||
while (1) {
|
||||
VLOG(1) << "Sending " << cursor->key();
|
||||
sender.SendTillSuccess(cursor->key(), ZMQ_SNDMORE);
|
||||
sender.SendTillSuccess(cursor->value(), 0);
|
||||
cursor->Next();
|
||||
if (!cursor->Valid()) {
|
||||
cursor->SeekToFirst();
|
||||
}
|
||||
}
|
||||
// We do not do an elegant quit since this binary is going to be terminated by
|
||||
// control+C.
|
||||
return 0;
|
||||
}
|
0
caffe/__init__.py
Normal file
0
caffe/__init__.py
Normal file
17
caffe/proto/CMakeLists.txt
Normal file
17
caffe/proto/CMakeLists.txt
Normal file
@ -0,0 +1,17 @@
|
||||
file(GLOB Caffe_PROTOBUF_FILES "${CMAKE_CURRENT_SOURCE_DIR}/*.proto")
|
||||
|
||||
caffe2_protobuf_generate_cpp_py(Caffe_PROTO_SRCS Caffe_PROTO_HEADERS Caffe_PROTO_PY ${Caffe_PROTOBUF_FILES})
|
||||
|
||||
add_library(Caffe_PROTO OBJECT ${Caffe_PROTO_HEADERS} ${Caffe_PROTO_SRCS})
|
||||
|
||||
if (MSVC)
|
||||
if(BUILD_SHARED_LIBS)
|
||||
set(Caffe2_API_DEFINE "-DCAFFE2_API=__declspec(dllexport)")
|
||||
else()
|
||||
set(Caffe2_API_DEFINE "-DCAFFE2_API=")
|
||||
endif()
|
||||
target_compile_definitions(
|
||||
Caffe_PROTO PRIVATE ${Caffe2_API_DEFINE})
|
||||
endif()
|
||||
|
||||
install(FILES ${Caffe_PROTO_HEADERS} DESTINATION include/caffe/proto)
|
0
caffe/proto/__init__.py
Normal file
0
caffe/proto/__init__.py
Normal file
1399
caffe/proto/caffe.proto
Normal file
1399
caffe/proto/caffe.proto
Normal file
File diff suppressed because it is too large
Load Diff
87
caffe2/.clang-format
Normal file
87
caffe2/.clang-format
Normal file
@ -0,0 +1,87 @@
|
||||
---
|
||||
AccessModifierOffset: -1
|
||||
AlignAfterOpenBracket: AlwaysBreak
|
||||
AlignConsecutiveAssignments: false
|
||||
AlignConsecutiveDeclarations: false
|
||||
AlignEscapedNewlinesLeft: true
|
||||
AlignOperands: false
|
||||
AlignTrailingComments: false
|
||||
AllowAllParametersOfDeclarationOnNextLine: false
|
||||
AllowShortBlocksOnASingleLine: false
|
||||
AllowShortCaseLabelsOnASingleLine: false
|
||||
AllowShortFunctionsOnASingleLine: Empty
|
||||
AllowShortIfStatementsOnASingleLine: false
|
||||
AllowShortLoopsOnASingleLine: false
|
||||
AlwaysBreakAfterReturnType: None
|
||||
AlwaysBreakBeforeMultilineStrings: true
|
||||
AlwaysBreakTemplateDeclarations: true
|
||||
BinPackArguments: false
|
||||
BinPackParameters: false
|
||||
BraceWrapping:
|
||||
AfterClass: false
|
||||
AfterControlStatement: false
|
||||
AfterEnum: false
|
||||
AfterFunction: false
|
||||
AfterNamespace: false
|
||||
AfterObjCDeclaration: false
|
||||
AfterStruct: false
|
||||
AfterUnion: false
|
||||
BeforeCatch: false
|
||||
BeforeElse: false
|
||||
IndentBraces: false
|
||||
BreakBeforeBinaryOperators: None
|
||||
BreakBeforeBraces: Attach
|
||||
BreakBeforeTernaryOperators: true
|
||||
BreakConstructorInitializersBeforeComma: false
|
||||
BreakAfterJavaFieldAnnotations: false
|
||||
BreakStringLiterals: false
|
||||
ColumnLimit: 80
|
||||
CommentPragmas: '^ IWYU pragma:'
|
||||
ConstructorInitializerAllOnOneLineOrOnePerLine: true
|
||||
ConstructorInitializerIndentWidth: 4
|
||||
ContinuationIndentWidth: 4
|
||||
Cpp11BracedListStyle: true
|
||||
DerivePointerAlignment: false
|
||||
DisableFormat: false
|
||||
ForEachMacros: [ FOR_EACH_RANGE, FOR_EACH, ]
|
||||
IncludeCategories:
|
||||
- Regex: '^<.*\.h(pp)?>'
|
||||
Priority: 1
|
||||
- Regex: '^<.*'
|
||||
Priority: 2
|
||||
- Regex: '.*'
|
||||
Priority: 3
|
||||
IndentCaseLabels: true
|
||||
IndentWidth: 2
|
||||
IndentWrappedFunctionNames: false
|
||||
KeepEmptyLinesAtTheStartOfBlocks: false
|
||||
MacroBlockBegin: ''
|
||||
MacroBlockEnd: ''
|
||||
MaxEmptyLinesToKeep: 1
|
||||
NamespaceIndentation: None
|
||||
ObjCBlockIndentWidth: 2
|
||||
ObjCSpaceAfterProperty: false
|
||||
ObjCSpaceBeforeProtocolList: false
|
||||
PenaltyBreakBeforeFirstCallParameter: 1
|
||||
PenaltyBreakComment: 300
|
||||
PenaltyBreakFirstLessLess: 120
|
||||
PenaltyBreakString: 1000
|
||||
PenaltyExcessCharacter: 1000000
|
||||
PenaltyReturnTypeOnItsOwnLine: 200
|
||||
PointerAlignment: Left
|
||||
ReflowComments: true
|
||||
SortIncludes: true
|
||||
SpaceAfterCStyleCast: false
|
||||
SpaceBeforeAssignmentOperators: true
|
||||
SpaceBeforeParens: ControlStatements
|
||||
SpaceInEmptyParentheses: false
|
||||
SpacesBeforeTrailingComments: 1
|
||||
SpacesInAngles: false
|
||||
SpacesInContainerLiterals: true
|
||||
SpacesInCStyleCastParentheses: false
|
||||
SpacesInParentheses: false
|
||||
SpacesInSquareBrackets: false
|
||||
Standard: Cpp11
|
||||
TabWidth: 8
|
||||
UseTab: Never
|
||||
...
|
310
caffe2/CMakeLists.txt
Normal file
310
caffe2/CMakeLists.txt
Normal file
@ -0,0 +1,310 @@
|
||||
# ---[ Declare source file lists
|
||||
|
||||
# ---[ Add respective subdirectories
|
||||
# Note: the folders that are being commented out have not been properly
|
||||
# addressed yet.
|
||||
|
||||
add_subdirectory(proto)
|
||||
|
||||
add_subdirectory(contrib)
|
||||
add_subdirectory(core)
|
||||
add_subdirectory(core/nomnigraph)
|
||||
add_subdirectory(cuda_rtc)
|
||||
add_subdirectory(db)
|
||||
add_subdirectory(distributed)
|
||||
# add_subdirectory(experiments) # note, we may remove this folder at some point
|
||||
add_subdirectory(image)
|
||||
add_subdirectory(video)
|
||||
add_subdirectory(mkl)
|
||||
add_subdirectory(mobile)
|
||||
add_subdirectory(mpi)
|
||||
add_subdirectory(observers)
|
||||
add_subdirectory(onnx)
|
||||
add_subdirectory(operators)
|
||||
add_subdirectory(operators/rnn)
|
||||
add_subdirectory(perfkernels)
|
||||
add_subdirectory(python)
|
||||
add_subdirectory(queue)
|
||||
add_subdirectory(sgd)
|
||||
add_subdirectory(share)
|
||||
# add_subdirectory(test) # todo: use caffe2_gtest_main instead of gtest_main because we will need to call GlobalInit
|
||||
add_subdirectory(transforms)
|
||||
add_subdirectory(utils)
|
||||
|
||||
# Advanced: if we have white list specified, we will do intersections for all
|
||||
# main lib srcs.
|
||||
if (CAFFE2_WHITELISTED_FILES)
|
||||
caffe2_do_whitelist(Caffe2_CPU_SRCS CAFFE2_WHITELISTED_FILES)
|
||||
caffe2_do_whitelist(Caffe2_GPU_SRCS CAFFE2_WHITELISTED_FILES)
|
||||
endif()
|
||||
|
||||
# Debug messages - if you want to get a list of source files, enable the
|
||||
# following.
|
||||
if (FALSE)
|
||||
message(STATUS "CPU sources: ")
|
||||
foreach(tmp ${Caffe2_CPU_SRCS})
|
||||
message(STATUS " " ${tmp})
|
||||
endforeach()
|
||||
|
||||
message(STATUS "GPU sources: ")
|
||||
foreach(tmp ${Caffe2_GPU_SRCS})
|
||||
message(STATUS " " ${tmp})
|
||||
endforeach()
|
||||
|
||||
message(STATUS "CPU test sources: ")
|
||||
foreach(tmp ${Caffe2_CPU_TEST_SRCS})
|
||||
message(STATUS " " ${tmp})
|
||||
endforeach()
|
||||
|
||||
message(STATUS "GPU test sources: ")
|
||||
foreach(tmp ${Caffe2_GPU_TEST_SRCS})
|
||||
message(STATUS " " ${tmp})
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
# ---[ Generate and install header files.
|
||||
|
||||
# Write the macros file.
|
||||
configure_file(
|
||||
${PROJECT_SOURCE_DIR}/caffe2/core/macros.h.in
|
||||
${PROJECT_BINARY_DIR}/caffe2/core/macros.h)
|
||||
|
||||
# Installing the header files
|
||||
install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
|
||||
DESTINATION include
|
||||
FILES_MATCHING PATTERN "*.h")
|
||||
install(FILES ${PROJECT_BINARY_DIR}/caffe2/core/macros.h
|
||||
DESTINATION include/caffe2/core)
|
||||
|
||||
|
||||
# ---[ List of libraries to link with
|
||||
add_library(caffe2_protos STATIC $<TARGET_OBJECTS:Caffe_PROTO> $<TARGET_OBJECTS:Caffe2_PROTO>)
|
||||
add_dependencies(caffe2_protos Caffe_PROTO Caffe2_PROTO)
|
||||
# If we are going to link protobuf locally inside caffe2 libraries, what we will do is
|
||||
# to create a helper static library that always contains libprotobuf source files, and
|
||||
# link the caffe2 related dependent libraries to it.
|
||||
target_include_directories(caffe2_protos INTERFACE $<INSTALL_INTERFACE:include>)
|
||||
# Reason for this public dependency is as follows:
|
||||
# (1) Strictly speaking, we should not expose any Protobuf related functions. We should
|
||||
# only use function interfaces wrapped with our own public API, and link protobuf
|
||||
# locally.
|
||||
# (2) However, currently across the Caffe2 codebase, we have extensive use of protobuf
|
||||
# functionalities. For example, not only libcaffe2.so uses it, but also other
|
||||
# binaries such as python extensions etc. As a result, we will have to have a
|
||||
# transitive dependency to libprotobuf.
|
||||
#
|
||||
# Good thing is that, if we specify CAFFE2_LINK_LOCAL_PROTOBUF, then we do not need to
|
||||
# separately deploy protobuf binaries - libcaffe2.so will contain all functionalities
|
||||
# one needs. One can verify this via ldd.
|
||||
#
|
||||
# TODO item in the future includes:
|
||||
# (1) Enable using lite protobuf
|
||||
# (2) Properly define public API that do not directly depend on protobuf itself.
|
||||
# (3) Expose the libprotobuf.a file for dependent libraries to link to.
|
||||
#
|
||||
# What it means for users/developers?
|
||||
# (1) Users: nothing affecting the users, other than the fact that CAFFE2_LINK_LOCAL_PROTOBUF
|
||||
# avoids the need to deploy protobuf.
|
||||
# (2) Developers: if one simply uses core caffe2 functionality without using protobuf,
|
||||
# nothing changes. If one has a dependent library that uses protobuf, then one needs to
|
||||
# have the right protobuf version as well as linking to libprotobuf.a.
|
||||
target_link_libraries(caffe2_protos PUBLIC protobuf::libprotobuf)
|
||||
|
||||
# Compile exposed libraries.
|
||||
add_library(caffe2 ${Caffe2_CPU_SRCS})
|
||||
caffe2_interface_library(caffe2_protos caffe2_protos_whole)
|
||||
target_link_libraries(caffe2 PRIVATE caffe2_protos_whole)
|
||||
if (${CAFFE2_LINK_LOCAL_PROTOBUF})
|
||||
target_link_libraries(caffe2 INTERFACE protobuf::libprotobuf)
|
||||
else()
|
||||
target_link_libraries(caffe2 PUBLIC protobuf::libprotobuf)
|
||||
endif()
|
||||
target_link_libraries(caffe2 PUBLIC ${Caffe2_PUBLIC_DEPENDENCY_LIBS})
|
||||
target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_LIBS})
|
||||
target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_WHOLE_LINK_LIBS})
|
||||
target_include_directories(caffe2 INTERFACE $<INSTALL_INTERFACE:include>)
|
||||
target_compile_options(caffe2 INTERFACE "-std=c++11")
|
||||
target_compile_options(caffe2 PRIVATE "-DCAFFE2_BUILD_MAIN_LIB")
|
||||
# Use -O2 for release builds (-O3 doesn't improve perf, and -Os results in perf regression)
|
||||
target_compile_options(caffe2 PRIVATE "$<$<OR:$<CONFIG:Release>,$<CONFIG:RelWithDebInfo>>:-O2>")
|
||||
install(TARGETS caffe2 EXPORT Caffe2Targets DESTINATION lib)
|
||||
caffe2_interface_library(caffe2 caffe2_library)
|
||||
list(APPEND Caffe2_MAIN_LIBS caffe2_library)
|
||||
|
||||
# ---[ CUDA library.
|
||||
if(USE_CUDA)
|
||||
# A hack to deal with cuda library dependencies and modern CMake: the
|
||||
# CUDA_ADD_LIBRARY includes a target_link_libraries, and as a result,
|
||||
# one cannot use PUBLIC/PRIVATE/INTERFACE for the target anymore. This
|
||||
# hack adds the PRIVATE keywords to CUDA_LIBRARIES so we can deal with
|
||||
# it. We will then manually add the cudart library as interface libs.
|
||||
set(__tmp ${CUDA_LIBRARIES})
|
||||
set(CUDA_LIBRARIES PRIVATE ${CUDA_LIBRARIES})
|
||||
CUDA_ADD_LIBRARY(caffe2_gpu ${Caffe2_GPU_SRCS})
|
||||
set(CUDA_LIBRARIES ${__tmp})
|
||||
target_link_libraries(caffe2_gpu INTERFACE caffe2::cudart)
|
||||
|
||||
target_include_directories(
|
||||
caffe2_gpu INTERFACE $<INSTALL_INTERFACE:include>)
|
||||
target_link_libraries(
|
||||
caffe2_gpu PUBLIC caffe2 ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS})
|
||||
target_link_libraries(
|
||||
caffe2_gpu PRIVATE ${Caffe2_CUDA_DEPENDENCY_LIBS})
|
||||
caffe2_interface_library(caffe2_gpu caffe2_gpu_library)
|
||||
list(APPEND Caffe2_MAIN_LIBS caffe2_gpu_library)
|
||||
install(TARGETS caffe2_gpu EXPORT Caffe2Targets DESTINATION lib)
|
||||
endif()
|
||||
|
||||
# ---[ Test binaries.
|
||||
if (BUILD_TEST)
|
||||
set(Caffe2_ALL_TEST_SRCS ${Caffe2_CPU_TEST_SRCS})
|
||||
if (USE_CUDA)
|
||||
list(APPEND Caffe2_ALL_TEST_SRCS ${Caffe2_GPU_TEST_SRCS})
|
||||
endif()
|
||||
|
||||
foreach(test_src ${Caffe2_ALL_TEST_SRCS})
|
||||
get_filename_component(test_name ${test_src} NAME_WE)
|
||||
add_executable(${test_name} "${test_src}")
|
||||
# For tests, some of the test code actually directly call the dependent
|
||||
# libraries even if they are not part of the public dependency libs. As a
|
||||
# result, we will explicitly link the test against the Caffe2 dependency
|
||||
# libs.
|
||||
target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS} gtest_main)
|
||||
if (USE_CUDA)
|
||||
target_link_libraries(${test_name} ${Caffe2_CUDA_DEPENDENCY_LIBS})
|
||||
endif()
|
||||
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.0)
|
||||
target_compile_features(${test_name} PRIVATE cxx_range_for)
|
||||
endif()
|
||||
add_test(NAME ${test_name} COMMAND $<TARGET_FILE:${test_name}>)
|
||||
install(TARGETS ${test_name} DESTINATION test)
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
|
||||
if (BUILD_PYTHON)
|
||||
# Python site-packages
|
||||
# Get canonical directory for python site packages (relative to install
|
||||
# location). It varys from system to system.
|
||||
pycmd(PYTHON_SITE_PACKAGES "
|
||||
from distutils import sysconfig
|
||||
print(sysconfig.get_python_lib(prefix=''))
|
||||
")
|
||||
# ---[ Options.
|
||||
SET(PYTHON_LIB_REL_PATH "${PYTHON_SITE_PACKAGES}" CACHE STRING "Python installation path (relative to CMake installation prefix)")
|
||||
message(STATUS "Using ${PYTHON_LIB_REL_PATH} as python relative installation path")
|
||||
# Python extension suffix
|
||||
# Try to get from python through sysconfig.get_env_var('EXT_SUFFIX') first,
|
||||
# fallback to ".pyd" if windows and ".so" for all others.
|
||||
pycmd(PY_EXT_SUFFIX "
|
||||
from distutils import sysconfig
|
||||
ext_suffix = sysconfig.get_config_var('EXT_SUFFIX')
|
||||
print(ext_suffix if ext_suffix else '')
|
||||
")
|
||||
if("${PY_EXT_SUFFIX}" STREQUAL "")
|
||||
if (MSVC)
|
||||
set(PY_EXT_SUFFIX ".pyd")
|
||||
else()
|
||||
set(PY_EXT_SUFFIX ".so")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# ---[ Python.
|
||||
add_library(caffe2_pybind11_state MODULE ${Caffe2_CPU_PYTHON_SRCS})
|
||||
set_target_properties(caffe2_pybind11_state PROPERTIES COMPILE_FLAGS "-fvisibility=hidden")
|
||||
set_target_properties(caffe2_pybind11_state PROPERTIES PREFIX "")
|
||||
set_target_properties(caffe2_pybind11_state PROPERTIES SUFFIX ${PY_EXT_SUFFIX})
|
||||
if (APPLE)
|
||||
set_target_properties(caffe2_pybind11_state PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
|
||||
endif()
|
||||
set_target_properties(
|
||||
caffe2_pybind11_state PROPERTIES LIBRARY_OUTPUT_DIRECTORY
|
||||
${CMAKE_BINARY_DIR}/caffe2/python)
|
||||
target_link_libraries(
|
||||
caffe2_pybind11_state caffe2_library)
|
||||
install(TARGETS caffe2_pybind11_state DESTINATION "${PYTHON_LIB_REL_PATH}/caffe2/python")
|
||||
|
||||
if(USE_CUDA)
|
||||
add_library(caffe2_pybind11_state_gpu MODULE ${Caffe2_GPU_PYTHON_SRCS})
|
||||
set_target_properties(caffe2_pybind11_state_gpu PROPERTIES COMPILE_FLAGS "-fvisibility=hidden")
|
||||
set_target_properties(caffe2_pybind11_state_gpu PROPERTIES PREFIX "")
|
||||
set_target_properties(caffe2_pybind11_state_gpu PROPERTIES SUFFIX ${PY_EXT_SUFFIX})
|
||||
if (APPLE)
|
||||
set_target_properties(caffe2_pybind11_state_gpu PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
|
||||
endif()
|
||||
set_target_properties(
|
||||
caffe2_pybind11_state_gpu PROPERTIES LIBRARY_OUTPUT_DIRECTORY
|
||||
${CMAKE_BINARY_DIR}/caffe2/python)
|
||||
target_link_libraries(
|
||||
caffe2_pybind11_state_gpu caffe2_library caffe2_gpu_library)
|
||||
install(TARGETS caffe2_pybind11_state_gpu DESTINATION "${PYTHON_LIB_REL_PATH}/caffe2/python")
|
||||
endif()
|
||||
|
||||
if (MSVC AND CMAKE_GENERATOR MATCHES "Visual Studio")
|
||||
# If we are building under windows, we will copy the file from
|
||||
# build/caffe2/python/{Debug,Release}/caffe2_pybind11_state.pyd
|
||||
# to its parent folder so that we can do in-build execution.
|
||||
add_custom_target(windows_python_copy_lib ALL)
|
||||
add_dependencies(windows_python_copy_lib caffe2_pybind11_state)
|
||||
add_custom_command(
|
||||
TARGET windows_python_copy_lib POST_BUILD
|
||||
COMMAND ${CMAKE_COMMAND} -E copy
|
||||
$<TARGET_FILE:caffe2_pybind11_state>
|
||||
${CMAKE_BINARY_DIR}/caffe2/python)
|
||||
if (USE_CUDA)
|
||||
add_dependencies(windows_python_copy_lib caffe2_pybind11_state_gpu)
|
||||
add_custom_command(
|
||||
TARGET windows_python_copy_lib POST_BUILD
|
||||
COMMAND ${CMAKE_COMMAND} -E copy
|
||||
$<TARGET_FILE:caffe2_pybind11_state_gpu>
|
||||
${CMAKE_BINARY_DIR}/caffe2/python)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Finally, Copy all python files to build directory
|
||||
# Generate and create all needed __init__.py files, if they aren't already
|
||||
# present in the current source tree.
|
||||
message(STATUS "Automatically generating missing __init__.py files.")
|
||||
caffe_autogen_init_py_files()
|
||||
|
||||
# Create a custom target that copies all python files.
|
||||
file(GLOB_RECURSE PYTHON_SRCS RELATIVE ${PROJECT_SOURCE_DIR}
|
||||
"${PROJECT_SOURCE_DIR}/caffe2/*.py")
|
||||
add_custom_target(python_copy_files ALL)
|
||||
if(MSVC OR CMAKE_GENERATOR MATCHES "Ninja")
|
||||
# ninja fails when the command line is too long so we split
|
||||
# the target into several. This would be beneficial for VS also
|
||||
# since it build targets in parallel but not custom commands
|
||||
foreach(python_src ${PYTHON_SRCS})
|
||||
get_filename_component(dir ${python_src} DIRECTORY)
|
||||
string(SHA1 name_hash "${python_src}")
|
||||
# get_filename_component(name_we ${python_src} NAME_WE)
|
||||
add_custom_target(python_copy_files_${name_hash}
|
||||
COMMAND ${CMAKE_COMMAND} -E copy
|
||||
${PROJECT_SOURCE_DIR}/${python_src} ${CMAKE_BINARY_DIR}/${dir})
|
||||
add_dependencies(python_copy_files python_copy_files_${name_hash})
|
||||
endforeach()
|
||||
else()
|
||||
foreach(python_src ${PYTHON_SRCS})
|
||||
get_filename_component(dir ${python_src} DIRECTORY)
|
||||
add_custom_command(
|
||||
TARGET python_copy_files PRE_BUILD
|
||||
COMMAND ${CMAKE_COMMAND} -E copy
|
||||
${PROJECT_SOURCE_DIR}/${python_src} ${CMAKE_BINARY_DIR}/${dir})
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
# Install commands
|
||||
# Pick up static python files
|
||||
install(DIRECTORY ${CMAKE_BINARY_DIR}/caffe2 DESTINATION ${PYTHON_LIB_REL_PATH}
|
||||
FILES_MATCHING PATTERN "*.py")
|
||||
# Caffe proto files
|
||||
install(DIRECTORY ${CMAKE_BINARY_DIR}/caffe DESTINATION ${PYTHON_LIB_REL_PATH}
|
||||
FILES_MATCHING PATTERN "*.py")
|
||||
# Caffe2 proto files
|
||||
install(DIRECTORY ${CMAKE_BINARY_DIR}/caffe2 DESTINATION ${PYTHON_LIB_REL_PATH}
|
||||
FILES_MATCHING PATTERN "*.py")
|
||||
endif()
|
||||
|
||||
# Finally, set the Caffe2_MAIN_LIBS variable in the parent scope.
|
||||
set(Caffe2_MAIN_LIBS ${Caffe2_MAIN_LIBS} PARENT_SCOPE)
|
21
caffe2/README.md
Normal file
21
caffe2/README.md
Normal file
@ -0,0 +1,21 @@
|
||||
# Caffe2
|
||||
|
||||
[](https://ci.pytorch.org/jenkins/job/caffe2-master)
|
||||
|
||||
Caffe2 is a lightweight, modular, and scalable deep learning framework. Building on the original [Caffe](http://caffe.berkeleyvision.org), Caffe2 is designed with expression, speed, and modularity in mind.
|
||||
|
||||
## Questions and Feedback
|
||||
|
||||
Please use Github issues (https://github.com/caffe2/caffe2/issues) to ask questions, report bugs, and request new features.
|
||||
|
||||
### Further Resources on [Caffe2.ai](http://caffe2.ai)
|
||||
|
||||
* [Installation](http://caffe2.ai/docs/getting-started.html)
|
||||
* [Learn More](http://caffe2.ai/docs/learn-more.html)
|
||||
* [Upgrading to Caffe2](http://caffe2.ai/docs/caffe-migration.html)
|
||||
* [Datasets](http://caffe2.ai/docs/datasets.html)
|
||||
* [Model Zoo](http://caffe2.ai/docs/zoo.html)
|
||||
* [Tutorials](http://caffe2.ai/docs/tutorials.html)
|
||||
* [Operators Catalogue](http://caffe2.ai/docs/operators-catalogue.html)
|
||||
* [C++ API](http://caffe2.ai/doxygen-c/html/classes.html)
|
||||
* [Python API](http://caffe2.ai/doxygen-python/html/namespaces.html)
|
1
caffe2/VERSION_NUMBER
Normal file
1
caffe2/VERSION_NUMBER
Normal file
@ -0,0 +1 @@
|
||||
0.8.2
|
0
caffe2/__init__.py
Normal file
0
caffe2/__init__.py
Normal file
17
caffe2/contrib/CMakeLists.txt
Normal file
17
caffe2/contrib/CMakeLists.txt
Normal file
@ -0,0 +1,17 @@
|
||||
add_subdirectory(aten)
|
||||
add_subdirectory(gloo)
|
||||
add_subdirectory(nccl)
|
||||
add_subdirectory(prof)
|
||||
add_subdirectory(shm_mutex)
|
||||
add_subdirectory(script)
|
||||
# Finally pass the src lists back to the parent
|
||||
|
||||
# CPU source, test sources, binary sources
|
||||
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
|
||||
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
|
||||
set(Caffe2_CPU_BINARY_SRCS ${Caffe2_CPU_BINARY_SRCS} PARENT_SCOPE)
|
||||
|
||||
# GPU source, test sources, binary sources
|
||||
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
|
||||
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
|
||||
set(Caffe2_GPU_BINARY_SRCS ${Caffe2_GPU_BINARY_SRCS} PARENT_SCOPE)
|
0
caffe2/contrib/__init__.py
Normal file
0
caffe2/contrib/__init__.py
Normal file
29
caffe2/contrib/aten/CMakeLists.txt
Normal file
29
caffe2/contrib/aten/CMakeLists.txt
Normal file
@ -0,0 +1,29 @@
|
||||
if(USE_ATEN)
|
||||
if(NOT USE_CUDA)
|
||||
set(NO_CUDA ON)
|
||||
endif()
|
||||
set(TORCH_CUDA_ARCH_LIST "3.5 5.2 6.0 6.1+PTX")
|
||||
set(TORCH_NVCC_FLAGS "-Xfatbin -compress-all")
|
||||
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
|
||||
set(AT_LINK_STYLE STATIC)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden")
|
||||
add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/aten aten EXCLUDE_FROM_ALL)
|
||||
|
||||
add_custom_command(OUTPUT aten_op.h
|
||||
COMMAND
|
||||
python ${CMAKE_CURRENT_SOURCE_DIR}/gen_op.py
|
||||
--third_party_root=${PROJECT_SOURCE_DIR}/third_party
|
||||
--template_dir=${PROJECT_SOURCE_DIR}/caffe2/contrib/aten
|
||||
DEPENDS
|
||||
ATen
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/gen_op.py
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/aten_op_template.h)
|
||||
|
||||
add_custom_target(__aten_op_header_gen DEPENDS aten_op.h)
|
||||
add_library(aten_op_header_gen INTERFACE)
|
||||
add_dependencies(aten_op_header_gen __aten_op_header_gen)
|
||||
|
||||
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} "${CMAKE_CURRENT_SOURCE_DIR}/aten_op.cc" PARENT_SCOPE)
|
||||
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} "${CMAKE_CURRENT_SOURCE_DIR}/aten_op_cuda.cc" PARENT_SCOPE)
|
||||
endif()
|
80
caffe2/contrib/aten/README.md
Normal file
80
caffe2/contrib/aten/README.md
Normal file
@ -0,0 +1,80 @@
|
||||
# An ATen operator for Caffe2
|
||||
|
||||
[ATen](https://github.com/zdevito/aten) is a simple tensor library thats exposes the Tensor operations in Torch
|
||||
and PyTorch directly in C++11. This library provides a generated wrapper around the ATen API
|
||||
that makes these functions available in Caffe2 as an operator. It also makes it accessible using the
|
||||
ToffeeIR.
|
||||
|
||||
|
||||
### Example Usage in Caffe2
|
||||
|
||||
First identify a function in ATen you want to call in [Functions.h](https://github.com/zdevito/ATen/blob/master/doc/Functions.h),
|
||||
[Tensor.h](https://github.com/zdevito/ATen/blob/master/doc/Tensor.h), or [Type.h](https://github.com/zdevito/ATen/blob/master/doc/Type.h).
|
||||
|
||||
We will call the `pow` operator:
|
||||
|
||||
```
|
||||
static inline Tensor pow(const Tensor & self, Scalar exponent);
|
||||
```
|
||||
|
||||
Now create a Caffe2 operator to call this op. The name of the operator is always `"ATen"`,
|
||||
and there is always a string attribute `operator` that defines which ATen function to call:
|
||||
|
||||
|
||||
```
|
||||
import numpy as np
|
||||
from caffe2.python import core, workspace
|
||||
|
||||
|
||||
# create the Caffe2 Op:
|
||||
op = core.CreateOperator(
|
||||
"ATen",
|
||||
["MyInput"],
|
||||
["MyOutput"],
|
||||
operator="pow", exponent=2.0)
|
||||
|
||||
```
|
||||
|
||||
Each `Tensor` input becomes an Caffe2 input Blob, and each output becomes a Caffe2 output blob.
|
||||
Non-tensor inputs such as `Scalar exponent` become Caffe2 `arg` attributes.
|
||||
In the case of `Scalar` the attributes can be either an integers or floating point numbers.
|
||||
|
||||
The op can now be run like any other Caffe2 operator:
|
||||
|
||||
```
|
||||
workspace.FeedBlob("MyInput",np.random.randn(2,3).astype(np.float32))
|
||||
workspace.RunOperatorOnce(op)
|
||||
print(workspace.FetchBlob("MyOutput")
|
||||
```
|
||||
|
||||
For methods, the first input is always the `this` Tensor in C++.
|
||||
To call methods of ATen's `Type` objects, you provide an additional string attribute
|
||||
that determines the type:
|
||||
|
||||
```
|
||||
# create a 2x4 tensor filled with floating point ones
|
||||
op = core.CreateOperator(
|
||||
"ATen",
|
||||
[],
|
||||
["MyOutput"],
|
||||
operator="ones", type="Float", size={2,4})
|
||||
```
|
||||
|
||||
Generally ATen operators are polymorphic across input types, and work on both the CPU and CUDA.
|
||||
|
||||
### Example Usage via PyTorch Symbolic
|
||||
|
||||
The ATen operator can also be used to define `symbolic` definitions for PyTorch when an operator is being exported
|
||||
to ONNX. In this case, the definition of the operator looks the same but is defined using PyTorch's ONNX API:
|
||||
|
||||
```
|
||||
class Add(torch.autograd.Function):
|
||||
|
||||
@staticmethod
|
||||
def symbolic(g, a, b):
|
||||
return g.op("ATen", a, b, operator_s = "add")
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, a, b):
|
||||
return a + b
|
||||
```
|
22
caffe2/contrib/aten/aten_op.cc
Normal file
22
caffe2/contrib/aten/aten_op.cc
Normal file
@ -0,0 +1,22 @@
|
||||
#include "caffe2/contrib/aten/aten_op.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
REGISTER_CPU_OPERATOR(ATen, ATenOp<CPUContext>);
|
||||
template<>
|
||||
at::Backend ATenOp<CPUContext>::backend() const {
|
||||
return at::kCPU;
|
||||
}
|
||||
|
||||
OPERATOR_SCHEMA(ATen);
|
||||
CAFFE_KNOWN_TYPE(at::Half);
|
||||
|
||||
namespace math {
|
||||
template<>
|
||||
void Set<at::Half,CPUContext>(const size_t N, const at::Half h, at::Half* v, CPUContext * c) {
|
||||
Set(0, h.x, (uint16_t*) v, c);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
1
caffe2/contrib/aten/aten_op.h
Normal file
1
caffe2/contrib/aten/aten_op.h
Normal file
@ -0,0 +1 @@
|
||||
#include "caffe2/caffe2/contrib/aten/gen_aten_op.h"
|
19
caffe2/contrib/aten/aten_op_cuda.cc
Normal file
19
caffe2/contrib/aten/aten_op_cuda.cc
Normal file
@ -0,0 +1,19 @@
|
||||
#include "caffe2/contrib/aten/aten_op.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
REGISTER_CUDA_OPERATOR(ATen, ATenOp<CUDAContext>);
|
||||
template<>
|
||||
at::Backend ATenOp<CUDAContext>::backend() const {
|
||||
return at::kCUDA;
|
||||
}
|
||||
|
||||
namespace math {
|
||||
template<>
|
||||
void Set<at::Half,CUDAContext>(const size_t N, const at::Half h, at::Half* v, CUDAContext * c) {
|
||||
Set(0, h.x, (uint16_t*) v, c);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
218
caffe2/contrib/aten/aten_op_template.h
Normal file
218
caffe2/contrib/aten/aten_op_template.h
Normal file
@ -0,0 +1,218 @@
|
||||
#pragma once
|
||||
#include <unordered_map>
|
||||
#include <string>
|
||||
#include <ATen/ATen.h>
|
||||
#include <caffe2/core/context.h>
|
||||
#include <caffe2/core/operator.h>
|
||||
#include <caffe2/utils/math.h>
|
||||
#include <iostream>
|
||||
|
||||
// a map from descriptor strings (see [DESCRIPTORS])
|
||||
// to the key in the switch statement that implements them
|
||||
static std::unordered_map<std::string, int> op_to_key = {
|
||||
${mappings}
|
||||
};
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
using at::Half; // for AT_FORALL_SCALAR_TYPES
|
||||
|
||||
template <class Context>
|
||||
class ATenOp : public Operator<Context> {
|
||||
public:
|
||||
ATenOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws) {
|
||||
VLOG(2) << "ATen OpDef: " << ProtoDebugString(operator_def) << "\n";
|
||||
switch(findImplementation(operator_def)) {
|
||||
${implementations}
|
||||
default:
|
||||
CAFFE_THROW("Unexpected key value for aten operator");
|
||||
}
|
||||
}
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override {
|
||||
return run_op();
|
||||
}
|
||||
private:
|
||||
// actual operator implementation is initialized in ctor.
|
||||
std::function<bool()> run_op;
|
||||
at::Backend backend() const;
|
||||
|
||||
TypeMeta typeMetaFor(const at::Tensor & t) {
|
||||
return typeMetaFor(t.type().scalarType());
|
||||
}
|
||||
TypeMeta typeMetaFor(at::ScalarType st) {
|
||||
#define DEFINE_CASE(ctype,aten_name,_) \
|
||||
case at::k##aten_name: \
|
||||
return TypeMeta::Make<ctype>();
|
||||
switch(st) {
|
||||
AT_FORALL_SCALAR_TYPES(DEFINE_CASE)
|
||||
default:
|
||||
CAFFE_THROW("Unknown ATen Type");
|
||||
}
|
||||
#undef DEFINE_CASE
|
||||
}
|
||||
|
||||
at::Type & typeFor(const Tensor<Context> & ten) {
|
||||
return at::getType(backend(), atScalarTypeFor(ten.meta()));
|
||||
}
|
||||
at::Tensor tensorWrapping(const Tensor<Context>& ten_) {
|
||||
auto& ten = const_cast<Tensor<Context>&>(ten_);
|
||||
return typeFor(ten).tensorFromBlob(ten.raw_mutable_data(), ten.dims());
|
||||
}
|
||||
at::Tensor loadInput(size_t i) {
|
||||
return tensorWrapping(Input(i));
|
||||
}
|
||||
std::vector<at::Tensor> loadInputsAtOffset(size_t s) {
|
||||
std::vector<at::Tensor> results;
|
||||
for (size_t i = s; i < InputSize(); i++) {
|
||||
results.push_back(loadInput(i));
|
||||
}
|
||||
return results;
|
||||
}
|
||||
at::ScalarType atScalarTypeFor(const TypeMeta & meta) {
|
||||
#define DEFINE_IF(ctype,aten_name,_) \
|
||||
if(meta.Match<ctype>()) { \
|
||||
return at::k##aten_name; \
|
||||
}
|
||||
AT_FORALL_SCALAR_TYPES(DEFINE_IF)
|
||||
#undef DEFINE_IF
|
||||
CAFFE_THROW("Unknown type meta"); // TODO: improve error message...
|
||||
}
|
||||
void assignTo(Tensor<Context> * dst, const at::Tensor & src_) {
|
||||
at::Tensor src = src_.contiguous();
|
||||
auto at_sizes = src.sizes();
|
||||
std::vector<int64_t> dims(at_sizes.begin(),at_sizes.end());
|
||||
dst->Resize(dims);
|
||||
dst->ShareExternalPointer(
|
||||
src.data_ptr(), typeMetaFor(src), 0, [src](void* ptr) mutable {
|
||||
// return a closure that holds a handle to t until it is called
|
||||
// to keep the aten memory alive
|
||||
return src.reset();
|
||||
});
|
||||
}
|
||||
void assignListStartingAt(
|
||||
size_t offset,
|
||||
const std::vector<at::Tensor>& tensors) {
|
||||
for (size_t i = 0; i < tensors.size(); i++) {
|
||||
assignTo(Output(offset + i), tensors[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// the AT_FORALL_SCALAR_TYPES macro just gives a 'i' or 'd' argument
|
||||
// for each type to specify if it is stored as a integer or a double.
|
||||
// We need this workaround here to extract the value in the scalar losslessly
|
||||
// because in some cases like 'sum' Torch promotes float to double
|
||||
// and will complain if we downcast it with toFloat, causing it
|
||||
// to lose precision
|
||||
double extract_d(const at::Scalar & s) {
|
||||
return s.toDouble();
|
||||
}
|
||||
int64_t extract_i(const at::Scalar & s) {
|
||||
return s.toLong();
|
||||
}
|
||||
|
||||
void assignTo(Tensor<Context> * dst, at::Type & inferred_type, at::Scalar scalar) {
|
||||
switch(inferred_type.scalarType()) {
|
||||
#define DEFINE_CASE(ctype,aten_name,native) \
|
||||
case at::k##aten_name: { \
|
||||
auto value = extract_##native(scalar); \
|
||||
assignToValue<ctype>(dst, at::convert<ctype,decltype(value)>(value)); \
|
||||
} break;
|
||||
AT_FORALL_SCALAR_TYPES(DEFINE_CASE)
|
||||
#undef DEFINE_CASE
|
||||
default:
|
||||
CAFFE_THROW("Unknown ATen Type");
|
||||
}
|
||||
}
|
||||
template<typename T>
|
||||
void assignToValue(Tensor<Context> * dst, T v) {
|
||||
dst->Resize(std::vector<TIndex>());
|
||||
math::Set(1, v, dst->template mutable_data<T>(), &context_);
|
||||
}
|
||||
int findImplementation(const OperatorDef& operator_def) {
|
||||
CAFFE_ENFORCE(HasArgument("operator"));
|
||||
std::string op = OperatorBase::GetSingleArgument<std::string>("operator", "");
|
||||
// construct descriptor string ([DESCRIPTORS]) given the attributes
|
||||
// and inputs of this operator_def, and look up the implementation key
|
||||
// for this variant
|
||||
std::stringstream descriptor;
|
||||
descriptor << op;
|
||||
std::vector<std::string> attrs;
|
||||
for(size_t i = 0; i < operator_def.arg_size(); i++) {
|
||||
auto & attr = operator_def.arg(i);
|
||||
if(attr.name() == "operator" || attr.name() == "type" )
|
||||
continue;
|
||||
attrs.push_back(attr.name());
|
||||
}
|
||||
std::sort(attrs.begin(), attrs.end());
|
||||
for(auto & a : attrs)
|
||||
descriptor << "-" << a;
|
||||
|
||||
std::string descriptor_sized =
|
||||
descriptor.str() + "-" + caffe2::to_string(InputSize());
|
||||
std::string descriptor_var_args = descriptor.str() + "-*";
|
||||
if (op_to_key.count(descriptor_sized) > 0) {
|
||||
return op_to_key[descriptor_sized];
|
||||
}
|
||||
if (op_to_key.count(descriptor_var_args) > 0) {
|
||||
return op_to_key[descriptor_var_args];
|
||||
}
|
||||
std::stringstream ss;
|
||||
ss << "Attempting to run unknown ATen operator configuration: "
|
||||
<< descriptor_sized;
|
||||
CAFFE_THROW(ss.str());
|
||||
}
|
||||
at::Scalar readScalarAttribute(const std::string & name) {
|
||||
if(OperatorBase::HasSingleArgumentOfType<int64_t>(name)) {
|
||||
return OperatorBase::GetSingleArgument<int64_t>(name, 0);
|
||||
} else {
|
||||
CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType<float>(name));
|
||||
return OperatorBase::GetSingleArgument<float>(name, 0);
|
||||
}
|
||||
}
|
||||
template<typename T>
|
||||
T readAttribute(const std::string & name) {
|
||||
CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType<T>(name));
|
||||
return OperatorBase::GetSingleArgument<T>(name, 0);
|
||||
}
|
||||
std::vector<int64_t> readIntList(const std::string & name) {
|
||||
CAFFE_ENFORCE(OperatorBase::HasArgument(name));
|
||||
return OperatorBase::GetRepeatedArgument<int64_t>(name, {});
|
||||
}
|
||||
template <int N>
|
||||
std::array<bool, N> readBoolMask(const std::string& name) {
|
||||
CAFFE_ENFORCE(OperatorBase::HasArgument(name));
|
||||
std::vector<int64_t> ints =
|
||||
OperatorBase::GetRepeatedArgument<int64_t>(name, {});
|
||||
std::array<bool, N> result;
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
result[i] = ints.at(i);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
at::ScalarType stringToScalarType(const std::string & name) {
|
||||
#define DEFINE_IF(type,aten) \
|
||||
if(#type == name) \
|
||||
return at::k##aten;
|
||||
DEFINE_IF(float16, Half)
|
||||
DEFINE_IF(float, Float)
|
||||
DEFINE_IF(double, Double)
|
||||
DEFINE_IF(uint8, Byte)
|
||||
DEFINE_IF(int8, Char)
|
||||
DEFINE_IF(int16, Short)
|
||||
DEFINE_IF(int32, Int)
|
||||
DEFINE_IF(int64, Long)
|
||||
CAFFE_THROW("unsupported type annotation: ", name);
|
||||
}
|
||||
at::Type & stringToType(const std::string & name) {
|
||||
return at::getType(backend(), stringToScalarType(name));
|
||||
}
|
||||
at::Type * readTypeAttribute(const std::string & name) {
|
||||
CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType<std::string>(name));
|
||||
return &stringToType(OperatorBase::GetSingleArgument<std::string>(name, ""));
|
||||
}
|
||||
};
|
||||
|
||||
}
|
86
caffe2/contrib/aten/aten_test.py
Normal file
86
caffe2/contrib/aten/aten_test.py
Normal file
@ -0,0 +1,86 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from caffe2.python import core, dyndep
|
||||
from hypothesis import given
|
||||
|
||||
import caffe2.python.hypothesis_test_util as hu
|
||||
import hypothesis.strategies as st
|
||||
import numpy as np
|
||||
|
||||
|
||||
dyndep.InitOpsLibrary('@/caffe2/caffe2/contrib/aten:aten_op')
|
||||
|
||||
|
||||
class TestATen(hu.HypothesisTestCase):
|
||||
|
||||
@given(inputs=hu.tensors(n=2), **hu.gcs)
|
||||
def test_add(self, inputs, gc, dc):
|
||||
op = core.CreateOperator(
|
||||
"ATen",
|
||||
["X", "Y"],
|
||||
["Z"],
|
||||
operator="add")
|
||||
|
||||
def ref(X, Y):
|
||||
return [X + Y]
|
||||
self.assertReferenceChecks(gc, op, inputs, ref)
|
||||
|
||||
@given(inputs=hu.tensors(n=1), **hu.gcs)
|
||||
def test_pow(self, inputs, gc, dc):
|
||||
op = core.CreateOperator(
|
||||
"ATen",
|
||||
["S"],
|
||||
["Z"],
|
||||
operator="pow", exponent=2.0)
|
||||
|
||||
def ref(X):
|
||||
return [np.square(X)]
|
||||
|
||||
self.assertReferenceChecks(gc, op, inputs, ref)
|
||||
|
||||
@given(x=st.integers(min_value=2, max_value=8), **hu.gcs)
|
||||
def test_sort(self, x, gc, dc):
|
||||
inputs = [np.random.permutation(x)]
|
||||
op = core.CreateOperator(
|
||||
"ATen",
|
||||
["S"],
|
||||
["Z", "I"],
|
||||
operator="sort")
|
||||
|
||||
def ref(X):
|
||||
return [np.sort(X), np.argsort(X)]
|
||||
self.assertReferenceChecks(gc, op, inputs, ref)
|
||||
|
||||
@given(inputs=hu.tensors(n=1), **hu.gcs)
|
||||
def test_sum(self, inputs, gc, dc):
|
||||
op = core.CreateOperator(
|
||||
"ATen",
|
||||
["S"],
|
||||
["Z"],
|
||||
operator="sum")
|
||||
|
||||
def ref(X):
|
||||
return [np.sum(X)]
|
||||
|
||||
self.assertReferenceChecks(gc, op, inputs, ref)
|
||||
|
||||
@given(**hu.gcs)
|
||||
def test_ones(self, gc, dc):
|
||||
op = core.CreateOperator(
|
||||
"ATen",
|
||||
[],
|
||||
["Z"],
|
||||
operator="ones", type="float", size={2, 4})
|
||||
|
||||
def ref():
|
||||
return [np.ones([2, 4])]
|
||||
|
||||
self.assertReferenceChecks(gc, op, [], ref)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import unittest
|
||||
unittest.main()
|
157
caffe2/contrib/aten/docs/pytorch_to_caffe2.md
Normal file
157
caffe2/contrib/aten/docs/pytorch_to_caffe2.md
Normal file
@ -0,0 +1,157 @@
|
||||
# Using ONNX and ATen to export models from PyTorch to Caffe2
|
||||
|
||||
When using ONNX to export a model from PyTorch into Caffe2, you sometimes end up
|
||||
hitting operators that are not yet part of the ONNX specification. These may be
|
||||
operators that haven't been standardized yet, or custom `torch.autograd.Function` types that
|
||||
are specific to a network.
|
||||
|
||||
To bridge this gap, we provide an experimental operator in ONNX that allows you to directly access PyTorch's tensor functions using the ATen library.
|
||||
[ATen](https://github.com/zdevito/aten) is the underlying C++ library that PyTorch uses to do tensor operations. Caffe2 has an [ATen operator](https://github.com/caffe2/caffe2/tree/master/caffe2/contrib/aten)
|
||||
that can run these tensor functions in a Caffe2 network after importing them through ONNX.
|
||||
|
||||
This guide explains how to configure Caffe2 and modify your PyTorch program to use
|
||||
this functionality.
|
||||
|
||||
### Enable ATen in Caffe2
|
||||
|
||||
The ATen facility in Caffe2 is part of a contrib package and needs to be enabled
|
||||
when you configure Caffe2 using cmake:
|
||||
|
||||
```
|
||||
git clone https://github.com/caffe2/caffe2/
|
||||
mkdir caffe2/build
|
||||
cd caffe2/build
|
||||
cmake -DUSE_ATEN=ON <other build options> ..
|
||||
make install
|
||||
```
|
||||
|
||||
### Describe How to Export a PyTorch Autograd Function using ATen
|
||||
|
||||
To export a model to ONNX, PyTorch first creates a trace of all the `torch.autograd.Function`s run
|
||||
in the forward pass of a network. For each function in the trace, it calls that function's
|
||||
`symbolic` method which describes how to construct the part of the ONNX graph
|
||||
that will compute this function (see [basic_ops.py](https://github.com/pytorch/pytorch/blob/master/torch/autograd/_functions/basic_ops.py#L59) for examples).
|
||||
|
||||
When equivalent ONNX operators do not exist, you can instead call any ATen function.
|
||||
As an example let's assume we have an autograd function which computes `x*x+y`:
|
||||
|
||||
```
|
||||
class MyFunction(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, x, y):
|
||||
return x*x + y
|
||||
```
|
||||
|
||||
We can add a `symbolic` method to it like so:
|
||||
|
||||
```
|
||||
class MyFunction(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, x, y):
|
||||
return x*x + y
|
||||
@staticmethod
|
||||
def symbolic(graph, x, y):
|
||||
x2 = graph.at("mul", x, x)
|
||||
r = graph.at("add", x2, y)
|
||||
# x, y, x2, and r are 'Node' objects
|
||||
# print(r) or print(graph) will print out a textual representation for debugging.
|
||||
# this representation will be converted to ONNX protobufs on export.
|
||||
return r
|
||||
```
|
||||
|
||||
The function `graph.at` adds a new ATen op the computation graph.
|
||||
You can call any ATen function using this facility. To do so,
|
||||
first identify a function in ATen you want to call in [Functions.h](https://github.com/zdevito/ATen/blob/master/doc/Functions.h),
|
||||
[Tensor.h](https://github.com/zdevito/ATen/blob/master/doc/Tensor.h), or [Type.h](https://github.com/zdevito/ATen/blob/master/doc/Type.h).
|
||||
|
||||
As an example, we might want to call the `pow` operator:
|
||||
|
||||
```
|
||||
static inline Tensor pow(const Tensor & self, Scalar exponent);
|
||||
```
|
||||
|
||||
We can translate this into the equivalent `graph.at` function:
|
||||
|
||||
```
|
||||
def symbolic(graph, x):
|
||||
graph.at("pow", x, exponent_f = 2.0) # compute x**2
|
||||
```
|
||||
|
||||
Tensor arguments to ATen functions become arguments to `graph.at`, while a `Scalar`
|
||||
like `exponent` becomes a keyword argument that specify ONNX attributes.
|
||||
Attributes are suffixed with their type (`_f` for floats and `_i` for integers, and `_s` for strings).
|
||||
|
||||
For methods, the first input is always the `this` Tensor in C++.
|
||||
To call methods of ATen's `Type` objects, you provide an additional string attribute
|
||||
that determines the type. For instance, `ones` creates a new constant tensor of all ones:
|
||||
```
|
||||
class Type {
|
||||
...
|
||||
virtual Tensor ones(IntList size) const;
|
||||
...
|
||||
};
|
||||
```
|
||||
|
||||
From PyTorch it can be created by adding the type as an additional attribute:
|
||||
|
||||
```
|
||||
def symbolic(graph, x):
|
||||
return graph.at("ones", type_s="float", size_i=[2,4])
|
||||
```
|
||||
|
||||
|
||||
Generally ATen operators are polymorphic across input types, and work on both the CPU and CUDA.
|
||||
|
||||
## Putting it together
|
||||
|
||||
With these building blocks we can now write and export networks that include custom operators using `torch.onnx.export`:
|
||||
|
||||
```
|
||||
class MyModule(nn.Module):
|
||||
def forward(self, x, y):
|
||||
# you can combine your ATen ops with standard onnx ones
|
||||
x = nn.ReLU()(x)
|
||||
return MyFunction.apply(x, y)
|
||||
|
||||
torch.onnx.export(MyModule(),
|
||||
(Variable(torch.ones(3,4)), Variable(torch.ones(3,4))),
|
||||
"output.onnx",
|
||||
verbose=True)
|
||||
```
|
||||
|
||||
This exports the following graph, which contains calls the `ATen` operator:
|
||||
|
||||
```
|
||||
graph(%1 : Float(3, 4)
|
||||
%2 : Float(3, 4)) {
|
||||
%3 : Float(3, 4) = Relu(%1), uses = [%4.i0, %4.i1];
|
||||
%4 : UNKNOWN_TYPE = ATen[operator=mul](%3, %3), uses = [%5.i0];
|
||||
%5 : Float(3, 4) = ATen[operator=add](%4, %2), uses = [%0.i0];
|
||||
return (%5);
|
||||
}
|
||||
```
|
||||
|
||||
The graph can then be imported using ONNX and run with Caffe2:
|
||||
|
||||
```
|
||||
import onnx
|
||||
import caffe2.python.onnx.backend
|
||||
import numpy as np
|
||||
|
||||
graph = onnx.load("output.onnx")
|
||||
|
||||
a = np.random.randn(3, 2).astype(np.float32)
|
||||
b = np.random.randn(3, 2).astype(np.float32)
|
||||
|
||||
prepared_backend = caffe2.python.onnx.backend.prepare(graph)
|
||||
W = {graph.graph.input[0].name: a, graph.graph.input[1].name: b}
|
||||
c2_out = prepared_backend.run(W)[0]
|
||||
|
||||
x = np.maximum(a, 0)
|
||||
r = x*x + b
|
||||
np.testing.assert_array_almost_equal(r, c2_out)
|
||||
```
|
||||
|
||||
### Code
|
||||
|
||||
For the full source code for this tutorial, see [sample.py](sample.py).
|
54
caffe2/contrib/aten/docs/sample.py
Normal file
54
caffe2/contrib/aten/docs/sample.py
Normal file
@ -0,0 +1,54 @@
|
||||
import numpy as np
|
||||
|
||||
from torch import nn
|
||||
from torch.autograd import Variable, Function
|
||||
import torch.onnx
|
||||
|
||||
import onnx
|
||||
import caffe2.python.onnx.backend
|
||||
|
||||
class MyFunction(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, x, y):
|
||||
return x*x + y
|
||||
@staticmethod
|
||||
def symbolic(graph, x, y):
|
||||
x2 = graph.at("mul", x, x)
|
||||
r = graph.at("add", x2, y)
|
||||
# x, y, x2, and r are 'Node' objects
|
||||
# print(r) or print(graph) will print out a textual representation for debugging.
|
||||
# this representation will be converted to ONNX protobufs on export.
|
||||
return r
|
||||
|
||||
class MyModule(nn.Module):
|
||||
def forward(self, x, y):
|
||||
# you can combine your ATen ops with standard onnx ones
|
||||
x = nn.ReLU()(x)
|
||||
return MyFunction.apply(x, y)
|
||||
|
||||
torch.onnx.export(MyModule(),
|
||||
(Variable(torch.ones(3,4)), Variable(torch.ones(3,4))),
|
||||
"output.onnx",
|
||||
verbose=True)
|
||||
|
||||
# prints the graph for debugging:
|
||||
# graph(%1 : Float(3, 4)
|
||||
# %2 : Float(3, 4)) {
|
||||
# %3 : Float(3, 4) = Relu(%1), uses = [%4.i0, %4.i1];
|
||||
# %4 : UNKNOWN_TYPE = ATen[operator=mul](%3, %3), uses = [%5.i0];
|
||||
# %5 : Float(3, 4) = ATen[operator=add](%4, %2), uses = [%0.i0];
|
||||
# return (%5);
|
||||
# }
|
||||
|
||||
graph = onnx.load("output.onnx")
|
||||
|
||||
a = np.random.randn(3, 4).astype(np.float32)
|
||||
b = np.random.randn(3, 4).astype(np.float32)
|
||||
|
||||
prepared_backend = caffe2.python.onnx.backend.prepare(graph)
|
||||
W = {graph.graph.input[0].name: a, graph.graph.input[1].name: b}
|
||||
c2_out = prepared_backend.run(W)[0]
|
||||
|
||||
x = np.maximum(a, 0)
|
||||
r = x*x + b
|
||||
np.testing.assert_array_almost_equal(r, c2_out)
|
274
caffe2/contrib/aten/gen_op.py
Executable file
274
caffe2/contrib/aten/gen_op.py
Executable file
@ -0,0 +1,274 @@
|
||||
#!/bin/env python
|
||||
|
||||
# Copyright (c) 2016-present, Facebook, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
##############################################################################
|
||||
|
||||
import sys
|
||||
import yaml
|
||||
import argparse
|
||||
import os
|
||||
from copy import deepcopy
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--template_dir", default=".", help="where template.h is")
|
||||
parser.add_argument("--yaml_dir", default="aten/src/ATen/ATen",
|
||||
help="where ATen yaml files are")
|
||||
parser.add_argument("--output_prefix", default="", help="")
|
||||
parser.add_argument(
|
||||
"--install_dir", default=".", help="where to put generated file")
|
||||
parser.add_argument("--third_party_root", default="", help="caffe2 third_party")
|
||||
args, _ = parser.parse_known_args()
|
||||
|
||||
if args.third_party_root:
|
||||
sys.path.append(os.path.join(args.third_party_root, "aten/src/ATen"))
|
||||
from code_template import CodeTemplate as CT
|
||||
else:
|
||||
from src.ATen.code_template import CodeTemplate as CT
|
||||
|
||||
OP_TEMPLATE = CT.from_file(
|
||||
os.path.join(args.template_dir, 'aten_op_template.h'))
|
||||
|
||||
|
||||
try:
|
||||
# use faster C loader if available
|
||||
from yaml import CLoader as Loader
|
||||
except ImportError:
|
||||
from yaml import Loader
|
||||
|
||||
|
||||
def write(filename, s):
|
||||
with open(filename, "w") as f:
|
||||
f.write(s)
|
||||
|
||||
|
||||
def read(filename):
|
||||
with open(filename, "r") as f:
|
||||
return f.read()
|
||||
|
||||
|
||||
def value_has_tensors(v):
|
||||
# Sparse shouldn't appear in public API, seems to be temporary bug
|
||||
return "Tensor" in v['dynamic_type'] and "Sparse" not in v['dynamic_type']
|
||||
|
||||
|
||||
def value_is_tensor_type(v):
|
||||
return value_has_tensors(v) and v['dynamic_type'] != 'TensorList'
|
||||
|
||||
|
||||
# for each aten type, how do we handle a return value of that type?
|
||||
RETURN_MAP = {
|
||||
'Tensor': 'assignTo(Output(${offset}),${output});',
|
||||
'Scalar': 'assignTo(Output(${offset}),*inferred_type, ${output});',
|
||||
'bool': 'assignToValue<int64_t>(Output(${offset}),${output});',
|
||||
'int64_t': 'assignToValue<int64_t>(Output(${offset}),${output});',
|
||||
'std::vector<Tensor>': 'assignListStartingAt(${offset}, ${output});',
|
||||
}
|
||||
|
||||
# for each non-Tensor aten argument, how to we read it from caffe2's
|
||||
# attribute list. Most of these call runtime functions defined in the
|
||||
# template class.
|
||||
ARGUMENT_MAP = {
|
||||
'Scalar': 'at::Scalar ${arg} = readScalarAttribute("${arg}");',
|
||||
'bool': 'bool ${arg} = readAttribute<int64_t>("${arg}");',
|
||||
'int': 'int ${arg} = readAttribute<int64_t>("${arg}");',
|
||||
'double': 'double ${arg} = readAttribute<float>("${arg}");',
|
||||
'int64_t': 'int64_t ${arg} = readAttribute<int64_t>("${arg}");',
|
||||
'IntList': 'auto ${arg} = readIntList("${arg}");',
|
||||
'std::array<bool, 2>': 'auto ${arg} = readBoolMask<2>("${arg}");',
|
||||
'std::array<bool, 3>': 'auto ${arg} = readBoolMask<3>("${arg}");',
|
||||
}
|
||||
|
||||
|
||||
def expand(o):
|
||||
num_defaults = sum(1 if 'default' in arg else 0 for arg in o['arguments'])
|
||||
results = [o]
|
||||
for i in range(0, num_defaults):
|
||||
# last num_default values should be default
|
||||
assert('default' in o['arguments'][-(i + 1)])
|
||||
v = deepcopy(o)
|
||||
v['arguments'] = v['arguments'][:-(i + 1)]
|
||||
results.append(v)
|
||||
return results
|
||||
|
||||
|
||||
# filter the list of declarations removing things we cannot support
|
||||
def supports(o):
|
||||
# skip all in-place operators for now since aten cannot Resize
|
||||
# caffe2 memory inside an operator
|
||||
if o['inplace']:
|
||||
return False
|
||||
|
||||
# _out variants also work in-place on arguments taken as destinations
|
||||
# we also cannot handle these because aten cannot resize caffe2 Tensors
|
||||
if "_out" in o['name']:
|
||||
return False
|
||||
|
||||
# skip return types we cannot handle
|
||||
for ret in o['returns']:
|
||||
if not value_has_tensors(ret) and ret['type'] not in RETURN_MAP:
|
||||
print("Skipping {} Because of Ret: {} ({})".format(
|
||||
o['name'], ret['type'], ret['dynamic_type']))
|
||||
return False
|
||||
|
||||
# skip arguments we cannot handle
|
||||
for arg in o['arguments']:
|
||||
if not value_has_tensors(arg) and arg['type'] not in ARGUMENT_MAP:
|
||||
print("Skipping {} Because of Arg: {} ({}) ".format(
|
||||
o['name'], arg['type'], arg['dynamic_type']))
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
# template for each potential operator.
|
||||
# each operator has an integer 'key' associated with it, and
|
||||
# a lambda that defines the operator
|
||||
# non-tensor attributes are created in ${initialization}
|
||||
# and then saved as arguments to the lambda
|
||||
# Inputs/Outputs are read inside the lambda
|
||||
OPTION_TEMPLATE = CT("""\
|
||||
case ${key}: { // ${name}
|
||||
${initialization}
|
||||
run_op = [=] {
|
||||
${statements}
|
||||
auto the_result = ${invocation};
|
||||
${assignments}
|
||||
return true;
|
||||
};
|
||||
} break;
|
||||
""")
|
||||
|
||||
|
||||
def get_output(o, i):
|
||||
if len(o['returns']) == 1:
|
||||
return 'the_result'
|
||||
else:
|
||||
return 'std::get<{}>(the_result)'.format(i)
|
||||
|
||||
|
||||
def attribute_names(o):
|
||||
return sorted([a['name'] for a in o['arguments'] if not value_has_tensors(a)])
|
||||
|
||||
|
||||
def required_attribute_names(o):
|
||||
return sorted([a['name'] for a in o['arguments'] if not value_has_tensors(a) and 'default' not in a])
|
||||
|
||||
|
||||
def self_as_first_argument(arguments):
|
||||
return ([a for a in arguments if a['name'] == 'self'] +
|
||||
[a for a in arguments if a['name'] != 'self'])
|
||||
|
||||
|
||||
def get_num_inputs(o):
    """Number of tensor inputs of option `o` as a string.

    Returns '*' when the op takes a TensorList, since the input count is
    then variadic rather than fixed.
    """
    tensor_args = 0
    for arg in o['arguments']:
        if arg['type'] == 'TensorList':
            return '*'
        if value_has_tensors(arg):
            tensor_args += 1
    return str(tensor_args)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Generate the Caffe2 "ATenOp" dispatch header: for every supported ATen
    # declaration, emit a switch-case that builds a lambda invoking the op.
    decls = yaml.load(read(os.path.join(args.yaml_dir, 'Declarations.yaml')), Loader=Loader)
    filtered = [expanded for o in decls for expanded in expand(o) if supports(expanded)]
    top_env = {
        'mappings': [],          # descriptor-string -> key table entries
        'implementations': [],   # one OPTION_TEMPLATE instantiation per op
    }
    seen = set()
    key = 0
    for o in filtered:
        # [DESCRIPTORS]
        # each option is associated with a descriptor string that is used
        # to figure out which version of an op is being used:
        # The format is:
        # opname-num_inputs-attribute_1-attribute2
        # Example:
        # lerp-2-weight
        # the operator lerp takes 2 arguments and has the attribute weight
        attr_names = attribute_names(o)
        num_inputs = get_num_inputs(o)
        descriptor = '-'.join([o['name']] + attr_names + [num_inputs])
        # Only the first overload with a given descriptor is emitted.
        if descriptor in seen:
            continue
        seen.add(descriptor)

        # map from descriptor string to the integer key in the switch statements
        # that initializes the operators
        top_env['mappings'].append('{{ "{}", {} }},'.format(descriptor, key))
        env = {
            'name': o['name'],
            'statements': [],      # C++ statements run inside the lambda
            'arguments': [],       # argument names in invocation order
            'assignments': [],     # writes of results back to Caffe2 outputs
            'initialization': [],  # attribute reads done once, outside the lambda
            'key': str(key),
        }
        # Tracks whether 'inferred_type' (the at::Type used to run the op)
        # has been established yet for this option.
        defined_inferred_type = False

        if 'Tensor' in o['method_of']:
            # make sure 'self' is the first argument. currently Declarations.yaml
            # does not always do this. Instead it keeps the argument list the same order
            # as the Type method.
            o['arguments'] = self_as_first_argument(o['arguments'])
        elif 'namespace' not in o['method_of']:
            # methods on type like 'ones' or 'zeros' always take a
            # string attribute that is translated into the at::Type object
            # e.g. "Float" is at::kFloat
            assert('Type' in o['method_of'])
            defined_inferred_type = True
            env['initialization'].append(
                'auto inferred_type = readTypeAttribute("type");')

        # i counts tensor inputs consumed so far (input slot offset).
        i = 0
        for arg in o['arguments']:
            env['arguments'].append(arg['name'])
            if arg['type'] == 'TensorList':
                # A TensorList consumes all remaining inputs from offset i.
                env['statements'].append(
                    'auto {} = loadInputsAtOffset({});'.format(arg['name'], i))
            elif value_is_tensor_type(arg):
                assert(i != '*')  # tensor list is not last argument
                # load tensor inputs from Caffe2
                env['statements'].append(
                    "auto {} = loadInput({});".format(arg['name'], i))
                i += 1
                if arg['dynamic_type'] == 'Tensor' and not defined_inferred_type:
                    # first tensor input is used to define the output type.
                    defined_inferred_type = True
                    env['statements'].append(
                        'auto inferred_type = &({}.type());'.format(
                            arg['name']))
            else:
                # Non-tensor argument: read it once from the op attributes
                # using the per-type snippet in ARGUMENT_MAP.
                init = CT(ARGUMENT_MAP[arg['type']]).substitute(env, arg=arg['name'])
                env['initialization'].append(init)

        # Copy each return value into the corresponding Caffe2 output slot.
        for i, r in enumerate(o['returns']):
            t = RETURN_MAP[r['type'] if not value_is_tensor_type(r) else 'Tensor']
            assignment = CT(t).substitute(env, offset=i, output=get_output(o, i))
            env['assignments'].append(assignment)

        # Build the C++ call expression: Tensor method, free function in
        # at::, or a method on the inferred at::Type.
        if 'Tensor' in o['method_of']:
            env['invocation'] = "self.{}({})".format(
                o['name'], ', '.join(env['arguments'][1:]))
        elif 'namespace' in o['method_of']:
            env['invocation'] = CT("at::${name}(${arguments})").substitute(env)
        else:
            assert('Type' in o['method_of'])
            env['invocation'] = CT(
                'inferred_type->${name}(${arguments})').substitute(env)

        top_env['implementations'].append(OPTION_TEMPLATE.substitute(env))
        key += 1
    write(os.path.join(args.install_dir, args.output_prefix + "aten_op.h"), OP_TEMPLATE.substitute(top_env))
|
202
caffe2/contrib/cuda-convnet2/LICENSE
Normal file
202
caffe2/contrib/cuda-convnet2/LICENSE
Normal file
@ -0,0 +1,202 @@
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
7
caffe2/contrib/cuda-convnet2/README.md
Normal file
7
caffe2/contrib/cuda-convnet2/README.md
Normal file
@ -0,0 +1,7 @@
|
||||
# cuda-convnet2
|
||||
Automatically exported from code.google.com/p/cuda-convnet2
|
||||
|
||||
You can read the documentation in two ways:
|
||||
|
||||
1. On this site: go to branches > wiki.
|
||||
2. On Google Code (for now?): https://code.google.com/p/cuda-convnet2/
|
50
caffe2/contrib/cuda-convnet2/build.sh
Executable file
50
caffe2/contrib/cuda-convnet2/build.sh
Executable file
@ -0,0 +1,50 @@
|
||||
#!/bin/sh
# Copyright 2014 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###############################################################################

# Fill in the below environment variables.
#
# If you're not sure what these paths should be,
# you can use the find command to try to locate them.
# For example, NUMPY_INCLUDE_PATH contains the file
# arrayobject.h. So you can search for it like this:
#
# find /usr -name arrayobject.h
#
# (it'll almost certainly be under /usr)

# CUDA toolkit installation directory.
export CUDA_INSTALL_PATH=/usr/local/cuda

# Python include directory. This should contain the file Python.h, among others.
export PYTHON_INCLUDE_PATH=/usr/include/python2.7

# Numpy include directory. This should contain the file arrayobject.h, among others.
export NUMPY_INCLUDE_PATH=/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/

# ATLAS library directory. This should contain the file libcblas.so, among others.
export ATLAS_LIB_PATH=/usr/lib/atlas-base

# You don't have to change these:
export LD_LIBRARY_PATH=$CUDA_INSTALL_PATH/lib64:$LD_LIBRARY_PATH
export CUDA_SDK_PATH=$CUDA_INSTALL_PATH/samples
export PATH=$PATH:$CUDA_INSTALL_PATH/bin

# Build each sub-project in dependency order. Any arguments given to this
# script ($*) are forwarded to every make invocation (e.g. extra make flags).
cd util && make numpy=1 -j $* && cd ..
cd nvmatrix && make -j $* && cd ..
cd cudaconv3 && make -j $* && cd ..
cd cudaconvnet && make -j $* && cd ..
cd make-data/pyext && make -j $* && cd ../..
291
caffe2/contrib/cuda-convnet2/convdata.py
Normal file
291
caffe2/contrib/cuda-convnet2/convdata.py
Normal file
@ -0,0 +1,291 @@
|
||||
# Copyright 2014 Google Inc. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from python_util.data import *
|
||||
import numpy.random as nr
|
||||
import numpy as n
|
||||
import random as r
|
||||
from time import time
|
||||
from threading import Thread
|
||||
from math import sqrt
|
||||
import sys
|
||||
#from matplotlib import pylab as pl
|
||||
from PIL import Image
|
||||
from StringIO import StringIO
|
||||
from time import time
|
||||
import itertools as it
|
||||
|
||||
class JPEGBatchLoaderThread(Thread):
    """Background thread that fetches a raw batch and decodes its JPEGs.

    The decoded result dict is appended to `list_out` so the owning data
    provider can pick it up after join()ing the thread.
    """

    def __init__(self, dp, batch_num, label_offset, list_out):
        Thread.__init__(self)
        self.list_out = list_out          # shared output slot read by the data provider
        self.label_offset = label_offset  # added to every raw label index
        self.dp = dp                      # owning data provider
        self.batch_num = batch_num        # which batch to load

    @staticmethod
    def load_jpeg_batch(rawdics, dp, label_offset):
        """Decode one or more raw batch dicts into flat image/label matrices.

        Returns a dict with:
          'data'   -- (cases*data_mult, inner_pixels*num_colors) float32 pixels
          'labvec' -- (cases*data_mult, 1): one label per case, picked at
                      random when a case has several labels, -1 when none
          'labmat' -- (cases*data_mult, num_classes) multi-hot label matrix
        """
        if type(rawdics) != list:
            rawdics = [rawdics]
        nc_total = sum(len(r['data']) for r in rawdics)

        jpeg_strs = list(it.chain.from_iterable(rd['data'] for rd in rawdics))
        labels = list(it.chain.from_iterable(rd['labels'] for rd in rawdics))

        img_mat = n.empty((nc_total * dp.data_mult, dp.inner_pixels * dp.num_colors), dtype=n.float32)
        lab_mat = n.zeros((nc_total, dp.get_num_classes()), dtype=n.float32)
        # Native decoder fills img_mat in place (cropping/multiview handled there).
        dp.convnet.libmodel.decodeJpeg(jpeg_strs, img_mat, dp.img_size, dp.inner_size, dp.test, dp.multiview)
        lab_vec = n.tile(n.asarray([(l[nr.randint(len(l))] if len(l) > 0 else -1) + label_offset for l in labels], dtype=n.single).reshape((nc_total, 1)), (dp.data_mult,1))
        for c in xrange(nc_total):
            lab_mat[c, [z + label_offset for z in labels[c]]] = 1
        # Replicate labels once per view when multiview produces several
        # copies of each case.
        lab_mat = n.tile(lab_mat, (dp.data_mult, 1))

        return {'data': img_mat[:nc_total * dp.data_mult,:],
                'labvec': lab_vec[:nc_total * dp.data_mult,:],
                'labmat': lab_mat[:nc_total * dp.data_mult,:]}

    def run(self):
        # Fetch the still-encoded batch, decode it, and publish the result.
        rawdics = self.dp.get_batch(self.batch_num)
        p = JPEGBatchLoaderThread.load_jpeg_batch(rawdics,
                                                  self.dp,
                                                  self.label_offset)
        self.list_out.append(p)
|
||||
|
||||
class ColorNoiseMakerThread(Thread):
    """Background thread that samples PCA color noise.

    Each run() draws `num_noise` standard-normal 3-vectors, scales them by
    the PCA standard deviations, projects them onto the PCA color
    directions, and appends the resulting matrix to `list_out`.
    """

    def __init__(self, pca_stdevs, pca_vecs, num_noise, list_out):
        Thread.__init__(self)
        self.pca_stdevs = pca_stdevs
        self.pca_vecs = pca_vecs
        self.num_noise = num_noise
        self.list_out = list_out

    def run(self):
        samples = nr.randn(self.num_noise, 3).astype(n.single)
        scaled = samples * self.pca_stdevs.T
        self.list_out.append(n.dot(scaled, self.pca_vecs.T))
|
||||
|
||||
class ImageDataProvider(LabeledDataProvider):
    """Data provider that decodes JPEG batches on a background thread.

    Batches are loaded/decoded by JPEGBatchLoaderThread while the model
    consumes the previous batch (double-buffered via self.data[0]/[1]).
    Optionally adds PCA color noise to training batches.
    """

    def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params=None, test=False):
        LabeledDataProvider.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
        self.data_mean = self.batch_meta['data_mean'].astype(n.single)
        # color_pca = (stdevs, eigenvectors), used for color-noise augmentation.
        self.color_eig = self.batch_meta['color_pca'][1].astype(n.single)
        self.color_stdevs = n.c_[self.batch_meta['color_pca'][0].astype(n.single)]
        self.color_noise_coeff = dp_params['color_noise']
        self.num_colors = 3
        self.img_size = int(sqrt(self.batch_meta['num_vis'] / self.num_colors))
        self.mini = dp_params['minibatch_size']
        # inner_size is the (possibly cropped) region actually fed to the net.
        self.inner_size = dp_params['inner_size'] if dp_params['inner_size'] > 0 else self.img_size
        self.inner_pixels = self.inner_size ** 2
        self.border_size = (self.img_size - self.inner_size) / 2
        self.multiview = dp_params['multiview_test'] and test
        self.num_views = 5*2  # presumably 5 crops x 2 mirrorings -- confirm against decodeJpeg
        self.data_mult = self.num_views if self.multiview else 1
        self.batch_size = self.batch_meta['batch_size']
        self.label_offset = 0 if 'label_offset' not in self.batch_meta else self.batch_meta['label_offset']
        self.scalar_mean = dp_params['scalar_mean']
        # Maintain pointers to previously-returned data matrices so they don't get garbage collected.
        self.data = [None, None] # These are pointers to previously-returned data matrices

        self.loader_thread, self.color_noise_thread = None, None
        self.convnet = dp_params['convnet']

        self.num_noise = self.batch_size
        self.batches_generated, self.loaders_started = 0, 0
        # Center crop of the mean image, flattened to one row for broadcasting.
        self.data_mean_crop = self.data_mean.reshape((self.num_colors,self.img_size,self.img_size))[:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size].reshape((1,3*self.inner_size**2))

        # A non-negative scalar_mean overrides the per-pixel mean image.
        if self.scalar_mean >= 0:
            self.data_mean_crop = self.scalar_mean

    def showimg(self, img):
        """Display one flattened (3*size*size,) image with matplotlib."""
        from matplotlib import pylab as pl
        pixels = img.shape[0] / 3
        size = int(sqrt(pixels))
        img = img.reshape((3,size,size)).swapaxes(0,2).swapaxes(0,1)
        pl.imshow(img, interpolation='nearest')
        pl.show()

    def get_data_dims(self, idx=0):
        """Dimensionality of returned matrix idx: 0=pixels, 2=label matrix, else 1."""
        if idx == 0:
            return self.inner_size**2 * 3
        if idx == 2:
            return self.get_num_classes()
        return 1

    def start_loader(self, batch_idx):
        # Kick off a background decode of the given batch.
        self.load_data = []
        self.loader_thread = JPEGBatchLoaderThread(self,
                                                   self.batch_range[batch_idx],
                                                   self.label_offset,
                                                   self.load_data)
        self.loader_thread.start()

    def start_color_noise_maker(self):
        # Start generating the next block of PCA color noise in the background.
        color_noise_list = []
        self.color_noise_thread = ColorNoiseMakerThread(self.color_stdevs, self.color_eig, self.num_noise, color_noise_list)
        self.color_noise_thread.start()
        return color_noise_list

    def set_labels(self, datadic):
        # Labels are produced by the loader thread; nothing to do here.
        pass

    def get_data_from_loader(self):
        """Wait for the loader thread's batch, store it, and prefetch the next."""
        if self.loader_thread is None:
            # First call: load synchronously, then start prefetching.
            self.start_loader(self.batch_idx)
            self.loader_thread.join()
            self.data[self.d_idx] = self.load_data[0]

            self.start_loader(self.get_next_batch_idx())
        else:
            # Set the argument to join to 0 to re-enable batch reuse
            self.loader_thread.join()
            if not self.loader_thread.is_alive():
                self.data[self.d_idx] = self.load_data[0]
                self.start_loader(self.get_next_batch_idx())
            #else:
            #    print "Re-using batch"
        self.advance_batch()

    def add_color_noise(self):
        # At this point the data already has 0 mean.
        # So I'm going to add noise to it, but I'm also going to scale down
        # the original data. This is so that the overall scale of the training
        # data doesn't become too different from the test data.

        s = self.data[self.d_idx]['data'].shape
        cropped_size = self.get_data_dims(0) / 3
        ncases = s[0]

        if self.color_noise_thread is None:
            # First call: generate noise synchronously, then start prefetching.
            self.color_noise_list = self.start_color_noise_maker()
            self.color_noise_thread.join()
            self.color_noise = self.color_noise_list[0]
            self.color_noise_list = self.start_color_noise_maker()
        else:
            self.color_noise_thread.join(0)
            if not self.color_noise_thread.is_alive():
                self.color_noise = self.color_noise_list[0]
                self.color_noise_list = self.start_color_noise_maker()

        # Reshape so each case contributes one noise scalar per color channel,
        # broadcast across all pixels of that channel.
        self.data[self.d_idx]['data'] = self.data[self.d_idx]['data'].reshape((ncases*3, cropped_size))
        self.color_noise = self.color_noise[:ncases,:].reshape((3*ncases, 1))
        self.data[self.d_idx]['data'] += self.color_noise * self.color_noise_coeff
        self.data[self.d_idx]['data'] = self.data[self.d_idx]['data'].reshape((ncases, 3* cropped_size))
        self.data[self.d_idx]['data'] *= 1.0 / (1.0 + self.color_noise_coeff) # <--- NOTE: This is the slow line, 0.25sec. Down from 0.75sec when I used division.

    def get_next_batch(self):
        """Return (epoch, batchnum, [data.T, labvec.T, labmat.T]) for the next batch."""
        self.d_idx = self.batches_generated % 2
        epoch, batchnum = self.curr_epoch, self.curr_batchnum

        self.get_data_from_loader()

        # Subtract mean
        self.data[self.d_idx]['data'] -= self.data_mean_crop

        if self.color_noise_coeff > 0 and not self.test:
            self.add_color_noise()
        self.batches_generated += 1

        return epoch, batchnum, [self.data[self.d_idx]['data'].T, self.data[self.d_idx]['labvec'].T, self.data[self.d_idx]['labmat'].T]

    # Takes as input an array returned by get_next_batch
    # Returns a (numCases, imgSize, imgSize, 3) array which can be
    # fed to pylab for plotting.
    # This is used by shownet.py to plot test case predictions.
    def get_plottable_data(self, data, add_mean=True):
        # NOTE(review): both branches of this conditional are identical, so the
        # f_contiguous/scalar_mean condition has no effect -- confirm intent.
        mean = self.data_mean_crop.reshape((data.shape[0],1)) if data.flags.f_contiguous or self.scalar_mean else self.data_mean_crop.reshape((data.shape[0],1))
        return n.require((data + (mean if add_mean else 0)).T.reshape(data.shape[1], 3, self.inner_size, self.inner_size).swapaxes(1,3).swapaxes(1,2) / 255.0, dtype=n.single)
|
||||
|
||||
class CIFARDataProvider(LabeledDataProvider):
    """Data provider for CIFAR-style pickled batches kept fully in memory.

    All batches in `batch_range` are unpickled up front; get_next_batch only
    crops (and, during training, randomly mirrors) into a preallocated buffer.
    """

    def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params=None, test=False):
        LabeledDataProvider.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
        self.img_size = 32
        self.num_colors = 3
        self.inner_size = dp_params['inner_size'] if dp_params['inner_size'] > 0 else self.batch_meta['img_size']
        self.border_size = (self.img_size - self.inner_size) / 2
        self.multiview = dp_params['multiview_test'] and test
        self.num_views = 9  # 3x3 grid of crop origins; see __trim_borders
        self.scalar_mean = dp_params['scalar_mean']
        self.data_mult = self.num_views if self.multiview else 1
        self.data_dic = []
        for i in batch_range:
            self.data_dic += [unpickle(self.get_data_file_name(i))]
            self.data_dic[-1]["labels"] = n.require(self.data_dic[-1]['labels'], dtype=n.single)
            # Replicate labels once per view for multiview testing.
            self.data_dic[-1]["labels"] = n.require(n.tile(self.data_dic[-1]["labels"].reshape((1, n.prod(self.data_dic[-1]["labels"].shape))), (1, self.data_mult)), requirements='C')
            self.data_dic[-1]['data'] = n.require(self.data_dic[-1]['data'] - self.scalar_mean, dtype=n.single, requirements='C')

        # Two preallocated crop buffers, alternated between calls so the
        # previously returned batch stays valid while a new one is filled.
        self.cropped_data = [n.zeros((self.get_data_dims(), self.data_dic[0]['data'].shape[1]*self.data_mult), dtype=n.single) for x in xrange(2)]

        self.batches_generated = 0
        # Center crop of the mean image, as a column for broadcasting.
        self.data_mean = self.batch_meta['data_mean'].reshape((self.num_colors,self.img_size,self.img_size))[:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size].reshape((self.get_data_dims(), 1))

    def get_next_batch(self):
        """Return (epoch, batchnum, [cropped_data, labels]) for the next batch."""
        epoch, batchnum = self.curr_epoch, self.curr_batchnum
        self.advance_batch()
        bidx = batchnum - self.batch_range[0]

        cropped = self.cropped_data[self.batches_generated % 2]

        self.__trim_borders(self.data_dic[bidx]['data'], cropped)
        cropped -= self.data_mean
        self.batches_generated += 1
        return epoch, batchnum, [cropped, self.data_dic[bidx]['labels']]

    def get_data_dims(self, idx=0):
        """Dimensionality of matrix idx: pixel count for 0, 1 for labels."""
        return self.inner_size**2 * self.num_colors if idx == 0 else 1

    # Takes as input an array returned by get_next_batch
    # Returns a (numCases, imgSize, imgSize, 3) array which can be
    # fed to pylab for plotting.
    # This is used by shownet.py to plot test case predictions.
    def get_plottable_data(self, data):
        return n.require((data + self.data_mean).T.reshape(data.shape[1], 3, self.inner_size, self.inner_size).swapaxes(1,3).swapaxes(1,2) / 255.0, dtype=n.single)

    def __trim_borders(self, x, target):
        """Crop each column (case) of x into target.

        Test time: the center crop, or 9 fixed crops in multiview mode.
        Training time: a random crop plus a 50% chance of horizontal mirror.
        """
        y = x.reshape(self.num_colors, self.img_size, self.img_size, x.shape[1])

        if self.test: # don't need to loop over cases
            if self.multiview:
                start_positions = [(0,0), (0, self.border_size), (0, self.border_size*2),
                                   (self.border_size, 0), (self.border_size, self.border_size), (self.border_size, self.border_size*2),
                                   (self.border_size*2, 0), (self.border_size*2, self.border_size), (self.border_size*2, self.border_size*2)]
                end_positions = [(sy+self.inner_size, sx+self.inner_size) for (sy,sx) in start_positions]
                for i in xrange(self.num_views):
                    target[:,i * x.shape[1]:(i+1)* x.shape[1]] = y[:,start_positions[i][0]:end_positions[i][0],start_positions[i][1]:end_positions[i][1],:].reshape((self.get_data_dims(),x.shape[1]))
            else:
                pic = y[:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size, :] # just take the center for now
                target[:,:] = pic.reshape((self.get_data_dims(), x.shape[1]))
        else:
            for c in xrange(x.shape[1]): # loop over cases
                startY, startX = nr.randint(0,self.border_size*2 + 1), nr.randint(0,self.border_size*2 + 1)
                endY, endX = startY + self.inner_size, startX + self.inner_size
                pic = y[:,startY:endY,startX:endX, c]
                if nr.randint(2) == 0: # also flip the image with 50% probability
                    pic = pic[:,:,::-1]
                target[:,c] = pic.reshape((self.get_data_dims(),))
|
||||
|
||||
class DummyConvNetLogRegDataProvider(LabeledDummyDataProvider):
|
||||
def __init__(self, data_dim):
|
||||
LabeledDummyDataProvider.__init__(self, data_dim)
|
||||
|
||||
self.img_size = int(sqrt(data_dim/3))
|
||||
|
||||
def get_next_batch(self):
|
||||
epoch, batchnum, dic = LabeledDummyDataProvider.get_next_batch(self)
|
||||
dic = {'data': dic[0], 'labels': dic[1]}
|
||||
print dic['data'].shape, dic['labels'].shape
|
||||
return epoch, batchnum, [dic['data'], dic['labels']]
|
||||
|
||||
# Returns the dimensionality of the two data matrices returned by get_next_batch
|
||||
def get_data_dims(self, idx=0):
|
||||
return self.batch_meta['num_vis'] if idx == 0 else 1
|
289
caffe2/contrib/cuda-convnet2/convnet.py
Normal file
289
caffe2/contrib/cuda-convnet2/convnet.py
Normal file
@ -0,0 +1,289 @@
|
||||
# Copyright 2014 Google Inc. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import numpy as n
|
||||
import numpy.random as nr
|
||||
import random as r
|
||||
from python_util.util import *
|
||||
from python_util.data import *
|
||||
from python_util.options import *
|
||||
from python_util.gpumodel import *
|
||||
import sys
|
||||
import math as m
|
||||
import layer as lay
|
||||
from convdata import ImageDataProvider, CIFARDataProvider, DummyConvNetLogRegDataProvider
|
||||
from os import linesep as NL
|
||||
import copy as cp
|
||||
import os
|
||||
|
||||
class Driver(object):
|
||||
def __init__(self, convnet):
|
||||
self.convnet = convnet
|
||||
|
||||
def on_start_batch(self, batch_data, train):
|
||||
pass
|
||||
|
||||
def on_finish_batch(self):
|
||||
pass
|
||||
|
||||
class GradCheckDriver(Driver):
|
||||
def on_start_batch(self, batch_data, train):
|
||||
data = batch_data[2]
|
||||
self.convnet.libmodel.checkGradients(data)
|
||||
|
||||
class TrainingDriver(Driver):
|
||||
def on_start_batch(self, batch_data, train):
|
||||
data = batch_data[2]
|
||||
self.convnet.libmodel.startBatch(data, self.convnet.get_progress(), not train)
|
||||
|
||||
class MultiviewTestDriver(TrainingDriver):
|
||||
def on_start_batch(self, batch_data, train):
|
||||
self.write_output = False
|
||||
if train:
|
||||
TrainingDriver.on_start_batch(self, batch_data, train)
|
||||
else:
|
||||
data = batch_data[2]
|
||||
num_views = self.convnet.test_data_provider.num_views
|
||||
if self.convnet.test_out != "" and self.convnet.logreg_name != "":
|
||||
self.write_output = True
|
||||
self.test_file_name = os.path.join(self.convnet.test_out, 'test_preds_%d' % batch_data[1])
|
||||
self.probs = n.zeros((data[0].shape[1]/num_views, self.convnet.test_data_provider.get_num_classes()), dtype=n.single)
|
||||
self.convnet.libmodel.startMultiviewTest(data, num_views, self.probs, self.convnet.logreg_name)
|
||||
else:
|
||||
self.convnet.libmodel.startMultiviewTest(data, num_views)
|
||||
|
||||
def on_finish_batch(self):
|
||||
if self.write_output:
|
||||
if not os.path.exists(self.convnet.test_out):
|
||||
os.makedirs(self.convnet.test_out)
|
||||
pickle(self.test_file_name, {'data': self.probs,
|
||||
'note': 'generated from %s' % self.convnet.save_file})
|
||||
|
||||
class FeatureWriterDriver(Driver):
|
||||
def __init__(self, convnet):
|
||||
Driver.__init__(self, convnet)
|
||||
self.last_batch = convnet.test_batch_range[-1]
|
||||
|
||||
def on_start_batch(self, batch_data, train):
|
||||
if train:
|
||||
raise ModelStateException("FeatureWriter must be used in conjunction with --test-only=1. It writes test data features.")
|
||||
|
||||
self.batchnum, self.data = batch_data[1], batch_data[2]
|
||||
|
||||
if not os.path.exists(self.convnet.feature_path):
|
||||
os.makedirs(self.convnet.feature_path)
|
||||
|
||||
self.num_ftrs = self.convnet.layers[self.convnet.write_features]['outputs']
|
||||
self.ftrs = n.zeros((self.data[0].shape[1], self.num_ftrs), dtype=n.single)
|
||||
self.convnet.libmodel.startFeatureWriter(self.data, [self.ftrs], [self.convnet.write_features])
|
||||
|
||||
def on_finish_batch(self):
|
||||
path_out = os.path.join(self.convnet.feature_path, 'data_batch_%d' % self.batchnum)
|
||||
pickle(path_out, {'data': self.ftrs, 'labels': self.data[1]})
|
||||
print "Wrote feature file %s" % path_out
|
||||
if self.batchnum == self.last_batch:
|
||||
pickle(os.path.join(self.convnet.feature_path, 'batches.meta'), {'source_model':self.convnet.load_file,
|
||||
'num_vis':self.num_ftrs,
|
||||
'batch_size': self.convnet.test_data_provider.batch_meta['batch_size']})
|
||||
|
||||
class ConvNet(IGPUModel):
|
||||
def __init__(self, op, load_dic, dp_params={}):
|
||||
filename_options = []
|
||||
for v in ('color_noise', 'multiview_test', 'inner_size', 'scalar_mean', 'minibatch_size'):
|
||||
dp_params[v] = op.get_value(v)
|
||||
|
||||
IGPUModel.__init__(self, "ConvNet", op, load_dic, filename_options, dp_params=dp_params)
|
||||
|
||||
def import_model(self):
|
||||
lib_name = "cudaconvnet._ConvNet"
|
||||
print "========================="
|
||||
print "Importing %s C++ module" % lib_name
|
||||
self.libmodel = __import__(lib_name,fromlist=['_ConvNet'])
|
||||
|
||||
def init_model_lib(self):
|
||||
self.libmodel.initModel(self.layers,
|
||||
self.device_ids,
|
||||
self.minibatch_size,
|
||||
self.conserve_mem)
|
||||
|
||||
def init_model_state(self):
|
||||
ms = self.model_state
|
||||
layers = ms['layers'] if self.loaded_from_checkpoint else {}
|
||||
ms['layers'] = lay.LayerParser.parse_layers(os.path.join(self.layer_path, self.layer_def),
|
||||
os.path.join(self.layer_path, self.layer_params), self, layers=layers)
|
||||
|
||||
self.do_decouple_conv()
|
||||
self.do_unshare_weights()
|
||||
|
||||
self.op.set_value('conv_to_local', [], parse=False)
|
||||
self.op.set_value('unshare_weights', [], parse=False)
|
||||
|
||||
self.set_driver()
|
||||
|
||||
def do_decouple_conv(self):
|
||||
# Convert convolutional layers to local
|
||||
if len(self.op.get_value('conv_to_local')) > 0:
|
||||
for lname in self.op.get_value('conv_to_local'):
|
||||
if self.model_state['layers'][lname]['type'] == 'conv':
|
||||
lay.LocalLayerParser.conv_to_local(self.model_state['layers'], lname)
|
||||
|
||||
def do_unshare_weights(self):
|
||||
# Decouple weight matrices
|
||||
if len(self.op.get_value('unshare_weights')) > 0:
|
||||
for name_str in self.op.get_value('unshare_weights'):
|
||||
if name_str:
|
||||
name = lay.WeightLayerParser.get_layer_name(name_str)
|
||||
if name is not None:
|
||||
name, idx = name[0], name[1]
|
||||
if name not in self.model_state['layers']:
|
||||
raise ModelStateException("Layer '%s' does not exist; unable to unshare" % name)
|
||||
layer = self.model_state['layers'][name]
|
||||
lay.WeightLayerParser.unshare_weights(layer, self.model_state['layers'], matrix_idx=idx)
|
||||
else:
|
||||
raise ModelStateException("Invalid layer name '%s'; unable to unshare." % name_str)
|
||||
|
||||
def set_driver(self):
|
||||
if self.op.get_value('check_grads'):
|
||||
self.driver = GradCheckDriver(self)
|
||||
elif self.op.get_value('multiview_test'):
|
||||
self.driver = MultiviewTestDriver(self)
|
||||
elif self.op.get_value('write_features'):
|
||||
self.driver = FeatureWriterDriver(self)
|
||||
else:
|
||||
self.driver = TrainingDriver(self)
|
||||
|
||||
def fill_excused_options(self):
|
||||
if self.op.get_value('check_grads'):
|
||||
self.op.set_value('save_path', '')
|
||||
self.op.set_value('train_batch_range', '0')
|
||||
self.op.set_value('test_batch_range', '0')
|
||||
self.op.set_value('data_path', '')
|
||||
|
||||
# Make sure the data provider returned data in proper format
|
||||
def parse_batch_data(self, batch_data, train=True):
|
||||
if max(d.dtype != n.single for d in batch_data[2]):
|
||||
raise DataProviderException("All matrices returned by data provider must consist of single-precision floats.")
|
||||
return batch_data
|
||||
|
||||
def start_batch(self, batch_data, train=True):
|
||||
self.driver.on_start_batch(batch_data, train)
|
||||
|
||||
def finish_batch(self):
|
||||
ret = IGPUModel.finish_batch(self)
|
||||
self.driver.on_finish_batch()
|
||||
return ret
|
||||
|
||||
def print_iteration(self):
|
||||
print "%d.%d (%.2f%%)..." % (self.epoch, self.batchnum, 100 * self.get_progress()),
|
||||
|
||||
def print_train_time(self, compute_time_py):
|
||||
print "(%.3f sec)" % (compute_time_py)
|
||||
|
||||
def print_costs(self, cost_outputs):
|
||||
costs, num_cases = cost_outputs[0], cost_outputs[1]
|
||||
children = set()
|
||||
for errname in costs:
|
||||
if sum(errname in self.layers[z]['children'] for z in costs) == 0:
|
||||
# print self.layers[errname]['children']
|
||||
for child in set(self.layers[errname]['children']) & set(costs.keys()):
|
||||
costs[errname] = [v + u for v, u in zip(costs[errname], costs[child])]
|
||||
children.add(child)
|
||||
|
||||
filtered_costs = eval(self.layers[errname]['outputFilter'])(costs[errname], num_cases)
|
||||
print "%s: " % errname,
|
||||
if 'outputFilterFormatter' not in self.layers[errname]:
|
||||
print ", ".join("%.6f" % v for v in filtered_costs),
|
||||
else:
|
||||
print eval(self.layers[errname]['outputFilterFormatter'])(self,filtered_costs),
|
||||
if m.isnan(filtered_costs[0]) or m.isinf(filtered_costs[0]):
|
||||
print "<- error nan or inf!"
|
||||
sys.exit(1)
|
||||
for c in children:
|
||||
del costs[c]
|
||||
|
||||
def print_train_results(self):
|
||||
self.print_costs(self.train_outputs[-1])
|
||||
|
||||
def print_test_status(self):
|
||||
pass
|
||||
|
||||
def print_test_results(self):
|
||||
print NL + "======================Test output======================"
|
||||
self.print_costs(self.test_outputs[-1])
|
||||
if not self.test_only:
|
||||
print NL + "----------------------Averages-------------------------"
|
||||
self.print_costs(self.aggregate_test_outputs(self.test_outputs[-len(self.test_batch_range):]))
|
||||
print NL + "-------------------------------------------------------",
|
||||
for name,val in sorted(self.layers.items(), key=lambda x: x[1]['id']): # This is kind of hacky but will do for now.
|
||||
l = self.layers[name]
|
||||
if 'weights' in l:
|
||||
wscales = [(l['name'], i, n.mean(n.abs(w)), n.mean(n.abs(wi))) for i,(w,wi) in enumerate(zip(l['weights'],l['weightsInc']))]
|
||||
print ""
|
||||
print NL.join("Layer '%s' weights[%d]: %e [%e] [%e]" % (s[0], s[1], s[2], s[3], s[3]/s[2] if s[2] > 0 else 0) for s in wscales),
|
||||
print "%sLayer '%s' biases: %e [%e]" % (NL, l['name'], n.mean(n.abs(l['biases'])), n.mean(n.abs(l['biasesInc']))),
|
||||
print ""
|
||||
|
||||
def conditional_save(self):
|
||||
self.save_state()
|
||||
|
||||
def aggregate_test_outputs(self, test_outputs):
|
||||
test_outputs = cp.deepcopy(test_outputs)
|
||||
num_cases = sum(t[1] for t in test_outputs)
|
||||
for i in xrange(1 ,len(test_outputs)):
|
||||
for k,v in test_outputs[i][0].items():
|
||||
for j in xrange(len(v)):
|
||||
test_outputs[0][0][k][j] += test_outputs[i][0][k][j]
|
||||
|
||||
return (test_outputs[0][0], num_cases)
|
||||
|
||||
@classmethod
|
||||
def get_options_parser(cls):
|
||||
op = IGPUModel.get_options_parser()
|
||||
op.add_option("mini", "minibatch_size", IntegerOptionParser, "Minibatch size", default=128)
|
||||
op.add_option("layer-def", "layer_def", StringOptionParser, "Layer definition file", set_once=False)
|
||||
op.add_option("layer-params", "layer_params", StringOptionParser, "Layer parameter file")
|
||||
op.add_option("layer-path", "layer_path", StringOptionParser, "Layer file path prefix", default="")
|
||||
op.add_option("check-grads", "check_grads", BooleanOptionParser, "Check gradients and quit?", default=0, excuses=['data_path','save_path', 'save_file_override', 'train_batch_range','test_batch_range'])
|
||||
op.add_option("multiview-test", "multiview_test", BooleanOptionParser, "Cropped DP: test on multiple patches?", default=0)
|
||||
op.add_option("inner-size", "inner_size", IntegerOptionParser, "Cropped DP: crop size (0 = don't crop)", default=0, set_once=True)
|
||||
op.add_option("conv-to-local", "conv_to_local", ListOptionParser(StringOptionParser), "Convert given conv layers to unshared local", default=[])
|
||||
op.add_option("unshare-weights", "unshare_weights", ListOptionParser(StringOptionParser), "Unshare weight matrices in given layers", default=[])
|
||||
op.add_option("conserve-mem", "conserve_mem", BooleanOptionParser, "Conserve GPU memory (slower)?", default=0)
|
||||
op.add_option("color-noise", "color_noise", FloatOptionParser, "Add PCA noise to color channels with given scale", default=0.0)
|
||||
op.add_option("test-out", "test_out", StringOptionParser, "Output test case predictions to given path", default="", requires=['logreg_name', 'multiview_test'])
|
||||
op.add_option("logreg-name", "logreg_name", StringOptionParser, "Logreg cost layer name (for --test-out)", default="")
|
||||
op.add_option("scalar-mean", "scalar_mean", FloatOptionParser, "Subtract this scalar from image (-1 = don't)", default=-1)
|
||||
|
||||
op.add_option("write-features", "write_features", StringOptionParser, "Write test data features from given layer", default="", requires=['feature-path'])
|
||||
op.add_option("feature-path", "feature_path", StringOptionParser, "Write test data features to this path (to be used with --write-features)", default="")
|
||||
|
||||
op.delete_option('max_test_err')
|
||||
op.options["testing_freq"].default = 57
|
||||
op.options["num_epochs"].default = 50000
|
||||
op.options['dp_type'].default = None
|
||||
|
||||
DataProvider.register_data_provider('dummy-lr-n', 'Dummy ConvNet logistic regression', DummyConvNetLogRegDataProvider)
|
||||
DataProvider.register_data_provider('image', 'JPEG-encoded image data provider', ImageDataProvider)
|
||||
DataProvider.register_data_provider('cifar', 'CIFAR-10 data provider', CIFARDataProvider)
|
||||
|
||||
return op
|
||||
|
||||
if __name__ == "__main__":
|
||||
# nr.seed(6)
|
||||
|
||||
op = ConvNet.get_options_parser()
|
||||
|
||||
op, load_dic = IGPUModel.parse_options(op)
|
||||
model = ConvNet(op, load_dic)
|
||||
model.start()
|
108
caffe2/contrib/cuda-convnet2/cudaconv3/Makefile
Normal file
108
caffe2/contrib/cuda-convnet2/cudaconv3/Makefile
Normal file
@ -0,0 +1,108 @@
|
||||
################################################################################
|
||||
#
|
||||
# Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
||||
#
|
||||
# NOTICE TO USER:
|
||||
#
|
||||
# This source code is subject to NVIDIA ownership rights under U.S. and
|
||||
# international Copyright laws.
|
||||
#
|
||||
# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
|
||||
# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
|
||||
# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
|
||||
# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
|
||||
# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
|
||||
# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
|
||||
# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
|
||||
# OR PERFORMANCE OF THIS SOURCE CODE.
|
||||
#
|
||||
# U.S. Government End Users. This source code is a "commercial item" as
|
||||
# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
|
||||
# "commercial computer software" and "commercial computer software
|
||||
# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
|
||||
# and is provided to the U.S. Government only as a commercial end item.
|
||||
# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
|
||||
# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
|
||||
# source code with only those rights set forth herein.
|
||||
#
|
||||
################################################################################
|
||||
|
||||
# Location of the CUDA Toolkit binaries and libraries
|
||||
CUDA_INC_PATH = $(CUDA_INSTALL_PATH)/include
|
||||
CUDA_BIN_PATH = $(CUDA_INSTALL_PATH)/bin
|
||||
CUDA_LIB_PATH = $(CUDA_INSTALL_PATH)/lib64
|
||||
|
||||
# Common binaries
|
||||
NVCC = $(CUDA_BIN_PATH)/nvcc
|
||||
GCC = g++
|
||||
AR = ar
|
||||
|
||||
# CUDA code generation flags
|
||||
GENCODE_SM35 := -gencode arch=compute_35,code=sm_35
|
||||
GENCODE_FLAGS := $(GENCODE_SM35)
|
||||
|
||||
LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart
|
||||
CCFLAGS := -m64
|
||||
NVCCFLAGS := -m64
|
||||
|
||||
# Debug build flags
|
||||
ifeq ($(dbg),1)
|
||||
CCFLAGS += -g
|
||||
NVCCFLAGS += -g -G
|
||||
DBG := debug
|
||||
else
|
||||
DBG := release
|
||||
NVCCFLAGS += -O3
|
||||
CCFLAGS += -O3
|
||||
endif
|
||||
|
||||
# Add profiler output
|
||||
ifeq ($(prof),1)
|
||||
NVCCFLAGS += --ptxas-options=-v
|
||||
endif
|
||||
|
||||
TARGETDIR := ./bin/$(DBG)
|
||||
OBJDIR := ./obj/$(DBG)
|
||||
|
||||
########## USER STUFF ###########
|
||||
LDFLAGS += -L../util -lutilpy -L../nvmatrix -lnvmatrix -lcublas
|
||||
INCLUDES := -I$(CUDA_INC_PATH) -I $(CUDA_SDK_PATH)/common/inc -I./include
|
||||
|
||||
CUFILES := $(shell find . -name "*.cu")
|
||||
CU_DEPS := $(shell find . -name "*.cuh")
|
||||
CCFILES := $(shell find . -name "*.cpp")
|
||||
C_DEPS := $(shell find . -name "*.h")
|
||||
|
||||
NVCCFLAGS += --compiler-options '-fPIC'
|
||||
LDFLAGS += -shared
|
||||
CCFLAGS += -fPIC
|
||||
TARGET := $(TARGETDIR)/libcudaconv.so
|
||||
|
||||
################################################################################
|
||||
# Set up target and object files
|
||||
################################################################################
|
||||
OBJS += $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(CCFILES))
|
||||
OBJS += $(patsubst %.c,$(OBJDIR)/%.c.o,$(CFILES))
|
||||
OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu.o,$(CUFILES))
|
||||
|
||||
# Target rules
|
||||
all: makedirs $(TARGET)
|
||||
|
||||
$(OBJDIR)/%.cu.o : %.cu $(CU_DEPS)
|
||||
$(NVCC) $(NVCCFLAGS) $(GENCODE_FLAGS) $(INCLUDES) -o $@ -c $<
|
||||
|
||||
$(OBJDIR)/%.cpp.o : %.cpp $(C_DEPS)
|
||||
$(GCC) $(CCFLAGS) $(INCLUDES) -o $@ -c $<
|
||||
|
||||
$(TARGET): $(OBJS)
|
||||
$(GCC) $(CCFLAGS) -o $@ $+ $(LDFLAGS)
|
||||
ln -sf $(TARGET) .
|
||||
|
||||
makedirs:
|
||||
mkdir -p $(TARGETDIR)
|
||||
mkdir -p $(OBJDIR)/src
|
||||
|
||||
clean:
|
||||
rm -rf ./obj
|
648
caffe2/contrib/cuda-convnet2/cudaconv3/include/conv_util.cuh
Normal file
648
caffe2/contrib/cuda-convnet2/cudaconv3/include/conv_util.cuh
Normal file
@ -0,0 +1,648 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef CONV_UTIL_CUH
|
||||
#define CONV_UTIL_CUH
|
||||
|
||||
#include "../../nvmatrix/include/nvmatrix.cuh"
|
||||
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
|
||||
#ifndef MIN
|
||||
#define MIN(a, b) ((a) > (b) ? (b) : (a))
|
||||
#endif
|
||||
#ifndef MAX
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
#endif
|
||||
|
||||
void convLocalMaxUndo(NVMatrix& images, NVMatrix& maxGrads, NVMatrix& maxActs, NVMatrix& target,
|
||||
int subsX, int startX, int strideX, int outputsX);
|
||||
void convLocalAvgUndo(NVMatrix& avgGrads, NVMatrix& target,
|
||||
int subsX, int startX, int strideX, int outputsX, int imgSize, bool sum);
|
||||
|
||||
void convLocalAvgUndo(NVMatrix& avgGrads, NVMatrix& target,
|
||||
int subsX, int startX, int strideX, int outputsX, int imgSize, bool sum,
|
||||
float scaleTargets, float scaleOutput);
|
||||
void convLocalMaxUndo(NVMatrix& images, NVMatrix& maxGrads, NVMatrix& maxActs, NVMatrix& target,
|
||||
int subsX, int startX, int strideX, int outputsX, float scaleTargets, float scaleOutput);
|
||||
|
||||
void convResponseNorm(NVMatrix& images, NVMatrix& denoms, NVMatrix& target, int numFilters, int sizeX, float addScale, float powScale, float minDiv);
|
||||
void convResponseNormUndo(NVMatrix& outGrads, NVMatrix& denoms, NVMatrix& inputs, NVMatrix& acts, NVMatrix& target, int numFilters,
|
||||
int sizeX, float addScale, float powScale, float scaleTargets, float scaleOutput);
|
||||
void convContrastNorm(NVMatrix& images, NVMatrix& meanDiffs, NVMatrix& denoms, NVMatrix& target, int numFilters, int sizeX, float addScale, float powScale, float minDiv);
|
||||
void convContrastNormUndo(NVMatrix& outGrads, NVMatrix& denoms, NVMatrix& meanDiffs, NVMatrix& acts, NVMatrix& target, int numFilters,
|
||||
int sizeX, float addScale, float powScale, float scaleTargets, float scaleOutput);
|
||||
|
||||
void convGaussianBlur(NVMatrix& images, NVMatrix& filter, NVMatrix& target, bool horiz, int numChannels,
|
||||
float scaleTargets, float scaleOutputs);
|
||||
void convBedOfNails(NVMatrix& images, NVMatrix& target, int numChannels, int imgSize, int startX,
|
||||
int strideX, float scaleTargets, float scaleOutput);
|
||||
void convBedOfNailsUndo(NVMatrix& actsGrad, NVMatrix& target, int numChannels, int imgSize,
|
||||
int startX, int strideX, float scaleTargets, float scaleOutput);
|
||||
|
||||
void convResizeBilinear(NVMatrix& images, NVMatrix& target, int imgSize, int tgtSize, float scale);
|
||||
void convRGBToYUV(NVMatrix& images, NVMatrix& target);
|
||||
void convRGBToLAB(NVMatrix& images, NVMatrix& target, bool center);
|
||||
void convCrop(NVMatrix& imgs, NVMatrix& target, int imgSize, int tgtSize, int startY, int startX);
|
||||
void normalizeLocalWeights(NVMatrix& weights, int numModules, float norm);
|
||||
void convContrastNormCrossMap(NVMatrix& images, NVMatrix& meanDiffs, NVMatrix& target,
|
||||
int numFilters, int sizeF, float addScale, float powScale, float minDiv, bool blocked);
|
||||
void convResponseNormCrossMapUndo(NVMatrix& outGrads, NVMatrix& inputs, NVMatrix& acts, NVMatrix& target, int numFilters,
|
||||
int sizeF, float addScale, float powScale, float minDiv, bool blocked, float scaleTargets, float scaleOutput);
|
||||
void convResponseNormCrossMap(NVMatrix& images, NVMatrix& target, int numFilters, int sizeF, float addScale,
|
||||
float powScale, bool blocked);
|
||||
void convResponseNormCrossMap(NVMatrix& images, NVMatrix& target, int numFilters, int sizeF, float addScale,
|
||||
float powScale, float minDiv, bool blocked);
|
||||
void convReflectHorizontal(NVMatrix& images, NVMatrix& targets, int imgSize);
|
||||
|
||||
void convCrossMapMaxPoolUndo(NVMatrix& images, NVMatrix& maxGrads, NVMatrix& maxActs, NVMatrix& target,
|
||||
const int imgSize, const int startF, const int poolSize,
|
||||
const int stride, const float scaleTargets, const float scaleOutputs);
|
||||
|
||||
cudaTextureObject_t GetTensorTextureObject(caffe2::TensorCUDA* tensor);
|
||||
|
||||
template<bool sum>
|
||||
class AvgPooler {
|
||||
public:
|
||||
__device__ inline float operator()(const float a, const float b) const {
|
||||
return a + b;
|
||||
}
|
||||
__device__ inline float getBaseValue() const {
|
||||
return 0;
|
||||
}
|
||||
__device__ inline float output(const float a, const int regionSize) const {
|
||||
return sum ? a : (a / regionSize);
|
||||
}
|
||||
};
|
||||
|
||||
class MaxPooler {
|
||||
public:
|
||||
__device__ inline float operator()(const float a, const float b) const {
|
||||
return fmaxf(a, b);
|
||||
}
|
||||
__device__ inline float getBaseValue() const {
|
||||
return -2e38;
|
||||
}
|
||||
__device__ inline float output(const float a, const int regionSize) const {
|
||||
return a;
|
||||
}
|
||||
};
|
||||
|
||||
class MaxAbsPooler {
|
||||
public:
|
||||
__device__ inline float operator()(const float a, const float b) const {
|
||||
return fabsf(a) > fabsf(b) ? a : b;
|
||||
}
|
||||
__device__ inline float getBaseValue() const {
|
||||
return 0.0f;
|
||||
}
|
||||
__device__ inline float output(const float a, const int regionSize) const {
|
||||
return a;
|
||||
}
|
||||
};
|
||||
|
||||
/*
|
||||
* Block size B_YxB_X
|
||||
* blockIdx.x determines output.x, image idx in batches of B_X*imgsPerThread
|
||||
* blockIdx.y determines output.y, filter idx in batches of B_Y*filtersPerThread
|
||||
*
|
||||
* So each block does one output for some number of images/filters.
|
||||
*
|
||||
* threadIdx.x determines img idx
|
||||
* threadIdx.y determines filter idx
|
||||
*
|
||||
* imgs: (numFilters, imgPixels, numImages)
|
||||
* target: (numFilters, numOutputs, numImages)
|
||||
*
|
||||
* numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
|
||||
*/
|
||||
|
||||
template<class Agg, int B_Y, int B_X, int imgsPerThread, int filtersPerThread, bool checkCaseBounds>
|
||||
__global__ void kLocalPool(float* imgs, float* target, const int imgSize, const int numFilters,
|
||||
const int numImages, const int subsX, const int startX, const int strideX,
|
||||
const int outputsX, Agg agg) {
|
||||
const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread);
|
||||
const int numFilterBlocks = DIVUP(numFilters, B_Y*filtersPerThread);
|
||||
const int outputIdxX = blockIdx.x / numImgBlocks;
|
||||
const int outputIdxY = blockIdx.y / numFilterBlocks;
|
||||
const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
|
||||
const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * B_Y * filtersPerThread;
|
||||
const int myFilterIdx = (blockFilterIdx + threadIdx.y*filtersPerThread);
|
||||
if (myFilterIdx >= numFilters) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int outputIdx = outputIdxY * outputsX + outputIdxX;
|
||||
const int numOutputs = outputsX * outputsX;
|
||||
const int imgPixels = imgSize * imgSize;
|
||||
|
||||
const int startImgPxX = startX + outputIdxX * strideX;
|
||||
const int startImgPxY = startX + outputIdxY * strideX;
|
||||
const int imgIdx = blockImgIdx + threadIdx.x;
|
||||
|
||||
imgs += myFilterIdx * imgPixels * numImages + imgIdx;
|
||||
target += (myFilterIdx * numOutputs + outputIdx) * numImages + imgIdx;
|
||||
|
||||
float prod[filtersPerThread][imgsPerThread];
|
||||
#pragma unroll
|
||||
for (int f = 0; f < filtersPerThread; f++) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < imgsPerThread; i++) {
|
||||
prod[f][i] = agg.getBaseValue();
|
||||
}
|
||||
}
|
||||
|
||||
const int loopStartY = MAX(0, startImgPxY);
|
||||
const int loopStartX = MAX(0, startImgPxX);
|
||||
const int loopEndY = MIN(imgSize, startImgPxY + subsX);
|
||||
const int loopEndX = MIN(imgSize, startImgPxX + subsX);
|
||||
const int regionSize = (loopEndY - loopStartY) * (loopEndX - loopStartX);
|
||||
for (int y = loopStartY; y < loopEndY; y++) {
|
||||
for (int x = loopStartX; x < loopEndX; x++) {
|
||||
const int imgPx = y * imgSize + x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < imgsPerThread; i++) {
|
||||
if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
|
||||
#pragma unroll
|
||||
for (int f = 0; f < filtersPerThread; f++) {
|
||||
prod[f][i] = agg(prod[f][i], imgs[(f * imgPixels + imgPx) * numImages + i * B_X]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < imgsPerThread; i++) {
|
||||
if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
|
||||
#pragma unroll
|
||||
for (int f = 0; f < filtersPerThread; f++) {
|
||||
target[f * numOutputs * numImages + i * B_X] = agg.output(prod[f][i], regionSize);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Block size B_YxB_X
|
||||
* blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
|
||||
* blockIdx.y determines pixel.y, output idx in batches of B_Y
|
||||
*
|
||||
* So each block does one pixel for some number of images/filters.
|
||||
*
|
||||
* threadIdx.x determines img idx
|
||||
* threadIdx.y determines output idx
|
||||
*
|
||||
* imgs: (numFilters, imgPixels, numImages)
|
||||
* target: (numOutputs, imgPixels, numImages) (out)
|
||||
*
|
||||
* numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
|
||||
*/
|
||||
template<class Agg, int B_Y, int B_X, int imgsPerThread, bool checkCaseBounds>
|
||||
__global__ void kPoolCrossMap(float* imgs, float* target, const int imgSize,
|
||||
const int numFilters, const int numImages, const int startF, const int poolSize,
|
||||
const int numOutputs, const int stride, Agg agg) {
|
||||
const int imgPixels = imgSize * imgSize;
|
||||
const int numImgBlocks = DIVUP(numImages, B_X*imgsPerThread);
|
||||
// const int numOutputs = DIVUP(numFilters, stride);
|
||||
const int numOutputBlocks = DIVUP(numOutputs,B_Y);
|
||||
const int pxIdxX = blockIdx.x / numImgBlocks;
|
||||
const int pxIdxY = blockIdx.y / numOutputBlocks;
|
||||
const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
|
||||
const int outputIdx = (blockIdx.y % numOutputBlocks) * B_Y + threadIdx.y;
|
||||
// const int filterIdx = outputIdx * stride;
|
||||
|
||||
const int pxIdx = pxIdxY * imgSize + pxIdxX;
|
||||
const int imgIdx = blockImgIdx + threadIdx.x;
|
||||
|
||||
if (outputIdx < numOutputs) {
|
||||
imgs += (pxIdx) * numImages + imgIdx;
|
||||
target += (outputIdx * imgPixels + pxIdx) * numImages + imgIdx;
|
||||
|
||||
float prod[imgsPerThread];
|
||||
#pragma unroll
|
||||
for (int i = 0; i < imgsPerThread; i++) {
|
||||
if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
|
||||
prod[i] = agg.getBaseValue();
|
||||
}
|
||||
}
|
||||
|
||||
const int myStartF = startF + outputIdx * stride;
|
||||
const int loopStartF = max(0, myStartF);
|
||||
const int loopEndF = min(numFilters, myStartF + poolSize);
|
||||
|
||||
for (int f = loopStartF; f < loopEndF; ++f) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < imgsPerThread; i++) {
|
||||
if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
|
||||
prod[i] = agg(prod[i], imgs[f * imgPixels * numImages + i * B_X]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < imgsPerThread; i++) {
|
||||
if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
|
||||
target[i * B_X] = agg.output(prod[i], poolSize);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Host-side launcher for cross-map (across-filter) pooling.
 *
 * imgs:   (numFilters, imgPixels, numImages)
 * target: (numOutputs, imgPixels, numImages)
 *
 * At every pixel location, output o pools over the filter range
 * [startF + o*stride, startF + o*stride + poolSize). The asserts below
 * require that consecutive pooling windows overlap (stride <= poolSize)
 * and together cover every filter.
 */
template<class Pooler>
void convPoolCrossMap(NVMatrix& images, NVMatrix& target, const int startF, const int poolSize,
                      const int numOutputs, const int stride, const int imgSize, Pooler pooler) {
    int numImages = images.getNumCols();
    int imgPixels = imgSize * imgSize;
    // Rows hold filters x pixels; division must be exact (checked below).
    int numFilters = images.getNumRows() / imgPixels;
    assert(images.getNumRows() == numFilters * imgPixels);

    assert(!images.isTrans());
    assert(!target.isTrans());
    assert(images.isContiguous());
//    assert(numFilters % 4 == 0);
//    assert(numImages % 128 == 0);
    assert(stride <= poolSize);
    assert(startF <= 0);
    assert(startF + (numOutputs-1) * stride + poolSize >= numFilters); // All filters must be covered

    cudaStream_t stream = NVMatrix::getDefaultStream();
    target.resize(imgPixels*numOutputs, numImages);
    // Each thread processes 4, 2, or 1 images depending on batch-size divisibility.
    int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;

    dim3 threads(32, 4);
    dim3 blocks(imgSize * DIVUP(numImages, threads.x * imgsPerThread), imgSize * DIVUP(numOutputs, threads.y));
    // Per-image bounds checks are only needed when the batch is not a multiple
    // of the images handled per block (threads.x * imgsPerThread).
    bool checkCaseBounds = numImages % (threads.x*imgsPerThread) != 0;
    if (!checkCaseBounds) {
        if (imgsPerThread == 4) {
            cudaFuncSetCacheConfig(kPoolCrossMap<Pooler, 4, 32, 4, false>, cudaFuncCachePreferShared);
            kPoolCrossMap<Pooler, 4, 32, 4, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                   imgSize, numFilters, numImages, startF, poolSize, numOutputs, stride, pooler);

        } else if (imgsPerThread == 2) {
            cudaFuncSetCacheConfig(kPoolCrossMap<Pooler, 4, 32, 2, false>, cudaFuncCachePreferShared);
            kPoolCrossMap<Pooler, 4, 32, 2, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                   imgSize, numFilters, numImages, startF, poolSize, numOutputs, stride, pooler);

        } else if (imgsPerThread == 1) {
            cudaFuncSetCacheConfig(kPoolCrossMap<Pooler, 4, 32, 1, false>, cudaFuncCachePreferShared);
            kPoolCrossMap<Pooler, 4, 32, 1, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                   imgSize, numFilters, numImages, startF, poolSize, numOutputs, stride, pooler);
        }
    } else {
        // Ragged batch: imgsPerThread is necessarily 1 here (the other
        // divisibility tests failed), so any other value is a logic error.
        if (imgsPerThread == 1) {
            cudaFuncSetCacheConfig(kPoolCrossMap<Pooler, 4, 32, 1, true>, cudaFuncCachePreferShared);
            kPoolCrossMap<Pooler, 4, 32, 1, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                   imgSize, numFilters, numImages, startF, poolSize, numOutputs, stride, pooler);
        } else {
            assert(false);
        }
    }
    getLastCudaError("convPoolCrossMap: kernel execution failed");
}
|
||||
|
||||
/*
 * Block size 16xB_X
 * blockIdx.x determines 4x4 pixel.x region, image idx in batches of B_X*imgsPerThread
 * blockIdx.y determines 4x4 pixel.y region, filter idx in batches of filtersPerThread
 *
 * So each block does a 4x4 region for some number of images/filters.
 *
 * threadIdx.x determines img idx
 * threadIdx.y determines pixel idx
 *
 * imgs:    (numFilters, imgPixels, numImages)
 * target:  (numFilters, numOutputs, numImages)
 *
 * B_X one of 8, 16, 32
 * imgsPerThread one of 1, 2, 4, 8, 16
 *
 * B_XximgsPerThread MUST be divisible by 32.
 * Number of filters MUST be divisible by filtersPerThread.
 *
 * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
 *
 * Final write-out will not be fully coalesced unless B_X is 32. But there's a lot more
 * reading than writing here, and the reading is all coalesced, so it should be OK.
 *
 * To be used when the stride is 1 and the pooling region is fairly large.
 */
template<class Agg, int B_X, int imgsPerThread, int filtersPerThread, bool checkCaseBounds>
__global__ void kLocalPool2(float* imgs, float* target, const int imgSize, const int numFilters,
                            const int numImages, const int subsX, const int startX,
                            const int outputsX, Agg agg) {
    // Staging buffer: one image pixel for filtersPerThread filters across
    // the block's B_X*imgsPerThread images, refilled for every (y, x).
    __shared__ float shImgs[filtersPerThread][B_X*imgsPerThread];
    const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread);
    const int numFilterBlocks = numFilters/(filtersPerThread);
    // Each block covers a 4x4 patch of output coordinates.
    const int blockOutputX = 4*(blockIdx.x / numImgBlocks);
    const int blockOutputY = 4*(blockIdx.y / numFilterBlocks);
    const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
    const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * filtersPerThread;

//    const int blockOutputIdx = blockOutputY * outputsX + blockOutputX;
    const int numOutputs = outputsX * outputsX;
    const int imgPixels = imgSize * imgSize;

    // Flat thread id, re-split into a 32-wide load layout for coalesced reads.
    const int tidx = threadIdx.y * B_X + threadIdx.x;
    const int loadY = tidx / 32, loadX = tidx % 32;

    // threadIdx.y (0..15) selects this thread's cell in the 4x4 output patch.
    const int myX = threadIdx.y % 4;
    const int myY = threadIdx.y / 4;

    const int myOutputIdxY = blockOutputY + myY;
    const int myOutputIdxX = blockOutputX + myX;
    const int myOutputIdx = myOutputIdxY * outputsX + myOutputIdxX;

    // Image-space extent of the pooling window for the patch's (0,0) output;
    // stride is implicitly 1, so per-thread windows are shifted by (myX, myY).
    const int startImgPxX = startX + blockOutputX;
    const int startImgPxY = startX + blockOutputY;
    const int endImgPxX = startImgPxX + subsX;
    const int endImgPxY = startImgPxY + subsX;

    const int myStartImgPxY = startImgPxY + myY;
    const int myStartImgPxX = startImgPxX + myX;
    const int myEndImgPxY = endImgPxY + myY;
    const int myEndImgPxX = endImgPxX + myX;

    // Union of all 16 windows in the patch, clipped to the image
    // (+3 because the last output in the patch is shifted by 3 pixels).
    const int loopStartY = MAX(startImgPxY, 0);
    const int loopStartX = MAX(startImgPxX, 0);
    const int loopEndY = MIN(imgSize, endImgPxY + 3);
    const int loopEndX = MIN(imgSize, endImgPxX + 3);

    const int imgIdx = blockImgIdx + threadIdx.x;

    imgs += (blockFilterIdx + loadY) * imgPixels * numImages + blockImgIdx + loadX;
    target += (blockFilterIdx * numOutputs + myOutputIdx) * numImages + imgIdx;

    // Per-thread accumulators, seeded with the aggregator's identity value.
    float prod[filtersPerThread][imgsPerThread];
    #pragma unroll
    for (int f = 0; f < filtersPerThread; f++) {
        #pragma unroll
        for (int i = 0; i < imgsPerThread; i++) {
            prod[f][i] = agg.getBaseValue();
        }
    }
    // Number of pixels this thread actually aggregated; the divisor passed
    // to agg.output() (meaningful for averaging-type aggregators).
    int regionSize = 0;
    for (int y = loopStartY; y < loopEndY; y++) {
        const bool isInY = y >= myStartImgPxY && y < myEndImgPxY ;
        for (int x = loopStartX; x < loopEndX; x++) {
            // Load a pixel
            const int px = y * imgSize + x;
            #pragma unroll
            for (int ly = 0; ly < filtersPerThread; ly += B_X/2) {
                if (filtersPerThread % (B_X/2) == 0 || ly + loadY < filtersPerThread) {
                    #pragma unroll
                    for (int lx = 0; lx < B_X*imgsPerThread; lx += 32) {
                        if (!checkCaseBounds || lx + loadX + blockImgIdx < numImages) {
                            shImgs[ly + loadY][lx + loadX] = imgs[(ly * imgPixels + px) * numImages + lx];
                        }
                    }
                }
            }
            // All threads cooperated in the load above; wait before reading.
            __syncthreads();

            // Is this pixel in my region?
            if (isInY && x >= myStartImgPxX && x < myEndImgPxX) {
                #pragma unroll
                for (int i = 0; i < imgsPerThread; i++) {
                    if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
                        #pragma unroll
                        for (int f = 0; f < filtersPerThread; f++) {
                            prod[f][i] = agg(prod[f][i], shImgs[f][threadIdx.x + i * B_X]);
                        }
                    }
                }
                ++regionSize;
            }
            // Keep shImgs stable until every thread is done with this pixel.
            __syncthreads();

        }
    }
    // Threads whose 4x4-patch cell falls outside the output grid write nothing.
    if (myOutputIdxY < outputsX && myOutputIdxX < outputsX) {
        #pragma unroll
        for (int i = 0; i < imgsPerThread; i++) {
            if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
                #pragma unroll
                for (int f = 0; f < filtersPerThread; f++) {
                    target[f * numOutputs * numImages + i * B_X] = agg.output(prod[f][i], regionSize);
                }
            }
        }
    }
}
|
||||
|
||||
/*
 * Host-side launcher for spatial (within-map) local pooling.
 *
 * imgs:   (numFilters, imgPixels, numImages)
 * target: (numFilters, outputs, numImages)
 *
 * Selects between two kernels: kLocalPool2 (shared-memory variant, used when
 * strideX == 1 and the pooling window is large) and kLocalPool (general
 * variant). Each branch below only varies the compile-time template
 * parameters <imgsPerThread, filtersPerThread, checkCaseBounds>.
 */
template<class Pooler>
void convLocalPool(NVMatrix& images, NVMatrix& target, int numFilters,
                   int subsX, int startX, int strideX, int outputsX, Pooler pooler) {
    int numImages = images.getNumCols();
    int imgPixels = images.getNumRows() / numFilters;
    assert(images.getNumRows() == numFilters * imgPixels);
    // Images must be square; verify that imgPixels is a perfect square.
    int imgSize = int(sqrt(imgPixels));
    assert(imgSize * imgSize == imgPixels);

    assert(!images.isTrans());
    assert(!target.isTrans());
    assert(images.isContiguous());
//    assert(numFilters % 4 == 0);
//    assert(numImages % 128 == 0);
    cudaStream_t stream = NVMatrix::getDefaultStream();
    int outputs = outputsX * outputsX;
    target.resize(numFilters*outputs, numImages);

    if (strideX == 1 && subsX >= 6 && outputsX > 1) {
        // NOTE: this part has not been optimized for Kepler
        // Shared-memory path (kLocalPool2): pick per-thread workloads from
        // batch-size / filter-count divisibility, then dispatch on them.
        int imgsPerThread = numImages % 128 == 0 ? 8 : 4;
        int filtersPerThread = numFilters % 4 == 0 ? 4 : numFilters % 3 == 0 ? 3 : numFilters % 2 == 0 ? 2 : 1;
        int bx = 8;
        bool checkCaseBounds = numImages % (bx*imgsPerThread) != 0;
        assert((imgsPerThread * bx) % 32 == 0);
        assert(numFilters % filtersPerThread == 0);
        dim3 threads(bx, 16);
        // Each block covers a 4x4 output patch (hence DIVUP(outputsX, 4)).
        dim3 blocks(DIVUP(outputsX, 4) * DIVUP(numImages, bx*imgsPerThread), DIVUP(outputsX, 4) * numFilters / filtersPerThread);
//        printf("threads: %dx%d, blocks: %dx%d, imgSize: %d, numFilters: %d, numImages: %d, subsX: %d, startX: %d, outputsX: %d\n",
//                threads.y, threads.x, blocks.y, blocks.x, imgSize, numFilters, numImages, subsX, startX, outputsX);
        if (imgsPerThread == 8) {
            if (filtersPerThread == 1) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 1, true>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 8, 1, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 1, false>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 8, 1, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                }
            } else if (filtersPerThread == 2) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 2, true>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 8, 2, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 2, false>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 8, 2, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                }
            } else if (filtersPerThread == 3) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 3, true>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 8, 3, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 3, false>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 8, 3, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                }
            } else if (filtersPerThread == 4) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 4, true>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 8, 4, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 4, false>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 8, 4, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                }
            }
        } else if (imgsPerThread == 4) {
            if (filtersPerThread == 1) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 1, true>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 4, 1, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 1, false>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 4, 1, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                }
            } else if (filtersPerThread == 2) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 2, true>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 4, 2, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 2, false>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 4, 2, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                }
            } else if (filtersPerThread == 3) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 3, true>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 4, 3, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 3, false>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 4, 3, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                }
            } else if (filtersPerThread == 4) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 4, true>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 4, 4, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 4, false>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 4, 4, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                }
            }
        }
    } else {
        // General path (kLocalPool): L1-preferring kernel, one output per thread row.
        int filtersPerThread = numFilters % 16 == 0 ? 4 : 1;
        int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
        bool checkCaseBounds = numImages % (32*imgsPerThread) != 0;
        dim3 threads(32, 4);
        dim3 blocks(DIVUP(numImages,32*imgsPerThread) * outputsX, DIVUP(numFilters, 4 * filtersPerThread) * outputsX);
        if (imgsPerThread == 4) {
            if (filtersPerThread == 1) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 4, 1, true>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 4, 1, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 4, 1, false>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 4, 1, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                }
            } else {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 4, 4, true>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 4, 4, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 4, 4, false>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 4, 4, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                }
            }
        } else if (imgsPerThread == 2) {
            if (filtersPerThread == 1) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 2, 1, true>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 2, 1, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 2, 1, false>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 2, 1, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                }
            } else {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 2, 4, true>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 2, 4, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 2, 4, false>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 2, 4, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                }
            }
        } else {
            if (filtersPerThread == 1) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 1, 1, true>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 1, 1, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 1, 1, false>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 1, 1, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                }
            } else {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 1, 4, true>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 1, 4, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 1, 4, false>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 1, 4, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                }
            }
        }
    }
    getLastCudaError("convLocalPool: kernel execution failed");
}
|
||||
|
||||
#endif /* CONV_UTIL_CUH */
|
||||
|
197
caffe2/contrib/cuda-convnet2/cudaconv3/include/cudaconv2.cuh
Normal file
197
caffe2/contrib/cuda-convnet2/cudaconv3/include/cudaconv2.cuh
Normal file
@ -0,0 +1,197 @@
|
||||
/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Public entry points for the cuda-convnet2 convolution routines, ported to
// operate on caffe2 tensors. Each operation comes in two overloads: the
// short form, and a long form taking scaleTargets/scaleOutput so results can
// be accumulated into an existing target (target = scaleTargets*target +
// scaleOutput*result -- presumed from the cuda-convnet2 convention; confirm
// against the .cu implementations).

#ifndef COMMON_CUH
#define COMMON_CUH

#include <helper_cuda.h> // helper functions CUDA error checking and initialization
#include "../../nvmatrix/include/nvmatrix.cuh"
#include "conv_util.cuh"

#include "caffe2/core/context_gpu.h"

// Memory layout of filter activity outputs: either grouped by module first
// or by filter first.
enum FILTER_OUTPUT_ORDER { MODULE_FILTER_IMAGE, FILTER_MODULE_IMAGE };

// Forward convolution: targets = images (*) filters (weights shared across
// modules).
void convFilterActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* images,
    caffe2::TensorCUDA* filters,
    caffe2::TensorCUDA* targets,
    int imgSizeY,
    int numModulesY,
    int numModulesX,
    int paddingStart,
    int moduleStride,
    int numImgColors,
    int numGroups);
// Accumulating variant of convFilterActs.
void convFilterActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* images,
    caffe2::TensorCUDA* filters,
    caffe2::TensorCUDA* targets,
    int imgSizeY,
    int numModulesY,
    int numModulesX,
    int paddingStart,
    int moduleStride,
    int numImgColors,
    int numGroups,
    float scaleTargets,
    float scaleOutput);

// Locally-connected forward pass: like convFilterActs but with unshared
// (per-module) weights.
void localFilterActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* images,
    caffe2::TensorCUDA* filters,
    caffe2::TensorCUDA* targets,
    int imgSizeY,
    int numModulesY,
    int numModulesX,
    int paddingStart,
    int moduleStride,
    int numImgColors,
    int numGroups);
// Accumulating variant of localFilterActs.
void localFilterActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* images,
    caffe2::TensorCUDA* filters,
    caffe2::TensorCUDA* targets,
    int imgSizeY,
    int numModulesY,
    int numModulesX,
    int paddingStart,
    int moduleStride,
    int numImgColors,
    int numGroups,
    float scaleTargets,
    float scaleOutput);

// Backward pass w.r.t. images (gradient of inputs) for shared weights.
void convImgActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* hidActs,
    caffe2::TensorCUDA* filters,
    caffe2::TensorCUDA* targets,
    int imgSizeY,
    int imgSizeX,
    int numModulesY,
    int paddingStart,
    int moduleStride,
    int numImgColors,
    int numGroups);
// Accumulating variant of convImgActs.
void convImgActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* hidActs,
    caffe2::TensorCUDA* filters,
    caffe2::TensorCUDA* targets,
    int imgSizeY,
    int imgSizeX,
    int numModulesY,
    int paddingStart,
    int moduleStride,
    int numImgColors,
    int numGroups,
    float scaleTargets,
    float scaleOutput);

// Backward pass w.r.t. images for unshared (locally-connected) weights.
void localImgActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* hidActs,
    caffe2::TensorCUDA* filters,
    caffe2::TensorCUDA* targets,
    int imgSizeY,
    int imgSizeX,
    int numModulesY,
    int paddingStart,
    int moduleStride,
    int numImgColors,
    int numGroups);
// Accumulating variant of localImgActs.
void localImgActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* hidActs,
    caffe2::TensorCUDA* filters,
    caffe2::TensorCUDA* targets,
    int imgSizeY,
    int imgSizeX,
    int numModulesY,
    int paddingStart,
    int moduleStride,
    int numImgColors,
    int numGroups,
    float scaleTargets,
    float scaleOutput);

// Backward pass w.r.t. weights (weight gradients) for shared weights.
// sumWidth controls partial-sum aggregation across modules.
void convWeightActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* images,
    caffe2::TensorCUDA* hidActs,
    caffe2::TensorCUDA* targets,
    int imgSizeY,
    int numModulesY,
    int numModulesX,
    int filterSize,
    int paddingStart,
    int moduleStride,
    int numImgColors,
    int numGroups,
    int sumWidth);
// Accumulating variant of convWeightActs.
void convWeightActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* images,
    caffe2::TensorCUDA* hidActs,
    caffe2::TensorCUDA* targets,
    int imgSizeY,
    int numModulesY,
    int numModulesX,
    int filterSize,
    int paddingStart,
    int moduleStride,
    int numImgColors,
    int numGroups,
    int sumWidth,
    float scaleTargets,
    float scaleOutput);

// Backward pass w.r.t. weights for unshared (locally-connected) weights.
void localWeightActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* images,
    caffe2::TensorCUDA* hidActs,
    caffe2::TensorCUDA* targets,
    int imgSizeY,
    int numModulesY,
    int numModulesX,
    int filterSize,
    int paddingStart,
    int moduleStride,
    int numImgColors,
    int numGroups);

// Accumulating variant of localWeightActs.
void localWeightActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* images,
    caffe2::TensorCUDA* hidActs,
    caffe2::TensorCUDA* targets,
    int imgSizeY,
    int numModulesY,
    int numModulesX,
    int filterSize,
    int paddingStart,
    int moduleStride,
    int numImgColors,
    int numGroups,
    float scaleTargets,
    float scaleOutput);

#endif /* COMMON_CUH */
|
3047
caffe2/contrib/cuda-convnet2/cudaconv3/src/conv_util.cu
Normal file
3047
caffe2/contrib/cuda-convnet2/cudaconv3/src/conv_util.cu
Normal file
File diff suppressed because it is too large
Load Diff
2281
caffe2/contrib/cuda-convnet2/cudaconv3/src/filter_acts.cu
Normal file
2281
caffe2/contrib/cuda-convnet2/cudaconv3/src/filter_acts.cu
Normal file
File diff suppressed because it is too large
Load Diff
2711
caffe2/contrib/cuda-convnet2/cudaconv3/src/img_acts.cu
Normal file
2711
caffe2/contrib/cuda-convnet2/cudaconv3/src/img_acts.cu
Normal file
File diff suppressed because it is too large
Load Diff
2744
caffe2/contrib/cuda-convnet2/cudaconv3/src/weight_acts.cu
Normal file
2744
caffe2/contrib/cuda-convnet2/cudaconv3/src/weight_acts.cu
Normal file
File diff suppressed because it is too large
Load Diff
112
caffe2/contrib/cuda-convnet2/cudaconvnet/Makefile
Normal file
112
caffe2/contrib/cuda-convnet2/cudaconvnet/Makefile
Normal file
@ -0,0 +1,112 @@
|
||||
################################################################################
#
# Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
#
# NOTICE TO USER:
#
# This source code is subject to NVIDIA ownership rights under U.S. and
# international Copyright laws.
#
# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
# CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
# IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
# OR PERFORMANCE OF THIS SOURCE CODE.
#
# U.S. Government End Users.  This source code is a "commercial item" as
# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
# "commercial computer software" and "commercial computer software
# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
# and is provided to the U.S. Government only as a commercial end item.
# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
# source code with only those rights set forth herein.
#
################################################################################

# Builds the _ConvNet Python extension module (a shared library) from all
# .cu/.cpp sources in this directory. Requires CUDA_INSTALL_PATH,
# CUDA_SDK_PATH, PYTHON_INCLUDE_PATH and NUMPY_INCLUDE_PATH to be set in the
# environment. Optional flags: dbg=1 (debug build), prof=1 (ptxas verbose).

# Location of the CUDA Toolkit binaries and libraries
CUDA_INC_PATH   = $(CUDA_INSTALL_PATH)/include
CUDA_BIN_PATH   = $(CUDA_INSTALL_PATH)/bin
CUDA_LIB_PATH   = $(CUDA_INSTALL_PATH)/lib64

# Common binaries
NVCC            = $(CUDA_BIN_PATH)/nvcc
GCC             = g++
AR              = ar

# CUDA code generation flags (Kepler sm_35 only)
GENCODE_SM35    := -gencode arch=compute_35,code=sm_35
GENCODE_FLAGS   := $(GENCODE_SM35)

LDFLAGS   := -L$(CUDA_LIB_PATH) -lcudart
CCFLAGS   := -m64
NVCCFLAGS := -m64

# Debug build flags (dbg=1): host -g, device -g -G; otherwise optimize.
ifeq ($(dbg),1)
      CCFLAGS   += -g
      NVCCFLAGS += -g -G
      DBG := debug
else
      DBG := release
      NVCCFLAGS += -O3
      CCFLAGS += -O3
endif

# Add profiler output (prof=1): print per-kernel register/smem usage.
ifeq ($(prof),1)
	NVCCFLAGS += --ptxas-options=-v
endif

TARGETDIR := ./bin/$(DBG)
OBJDIR := ./obj/$(DBG)

########## USER STUFF ###########
# Detect the python major.minor version to link the matching libpython.
PYTHON_VERSION=$(shell python -V 2>&1 | cut -d ' ' -f 2 | cut -d '.' -f 1,2)
MODELNAME := _ConvNet
LDFLAGS += -lpthread -ljpeg -lpython$(PYTHON_VERSION) -L../util -lutilpy -L../nvmatrix -lnvmatrix -L../cudaconv3 -lcudaconv -lcublas -Wl,-rpath=./util -Wl,-rpath=./nvmatrix -Wl,-rpath=./cudaconv3
INCLUDES := -I$(CUDA_INC_PATH) -I $(CUDA_SDK_PATH)/common/inc -I./include -I$(PYTHON_INCLUDE_PATH) -I$(NUMPY_INCLUDE_PATH)

DEFINES := -DNUMPY_INTERFACE

# Sources/headers are discovered recursively; headers act as order-only-ish
# rebuild triggers for every object (coarse, but simple).
CUFILES := $(shell find . -name "*.cu")
CU_DEPS := $(shell find . -name "*.cuh")
CCFILES := $(shell find . -name "*.cpp")
C_DEPS  := $(shell find . -name "*.h")

# Position-independent code + shared linkage: the target is a Python module.
NVCCFLAGS += --compiler-options '-fPIC'
LDFLAGS += -shared
CCFLAGS += -fPIC
TARGET := $(TARGETDIR)/$(MODELNAME).so

################################################################################
# Set up target and object files
################################################################################
OBJS +=  $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(CCFILES))
OBJS +=  $(patsubst %.c,$(OBJDIR)/%.c.o,$(CFILES))
OBJS +=  $(patsubst %.cu,$(OBJDIR)/%.cu.o,$(CUFILES))

# Target rules
all: makedirs $(TARGET)

$(OBJDIR)/%.cu.o : %.cu $(CU_DEPS)
	$(NVCC) $(DEFINES) $(NVCCFLAGS) $(GENCODE_FLAGS) $(INCLUDES) -o $@ -c $<

$(OBJDIR)/%.cpp.o : %.cpp $(C_DEPS)
	$(GCC) $(DEFINES) $(CCFLAGS) $(INCLUDES) -o $@ -c $<

$(TARGET): $(OBJS)
	$(GCC) $(CCFLAGS) -o $@ $+ $(LDFLAGS) $(EXTRA_LDFLAGS)
	ln -sf $(TARGET) .

makedirs:
	mkdir -p $(TARGETDIR)
	mkdir -p $(OBJDIR)/src

clean:
	rm -rf ./obj
|
@ -0,0 +1,66 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef ACTBROADCASTER_CUH_H_
|
||||
#define ACTBROADCASTER_CUH_H_
|
||||
|
||||
#include <map>
|
||||
#include "streambroadcast.cuh"
|
||||
#include "copypipeline.cuh"
|
||||
|
||||
class BroadcastMessage {
|
||||
public:
|
||||
enum MESSAGE_TYPE {
|
||||
BROADCAST,
|
||||
EXIT
|
||||
};
|
||||
protected:
|
||||
int _srcDevice;
|
||||
std::map<int, NVMatrix*> _mats;
|
||||
int _userIdx;
|
||||
Queue<int>* _finishQueue;
|
||||
MESSAGE_TYPE _type;
|
||||
BroadcastMessage(MESSAGE_TYPE type);
|
||||
public:
|
||||
BroadcastMessage(std::map<int, NVMatrix*> mats, int srcDevice, int userIdx, Queue<int>& finishQueue);
|
||||
|
||||
int getSrcDevice();
|
||||
std::map<int, NVMatrix*>& getMatrices();
|
||||
int getUserIdx();
|
||||
Queue<int>& getFinishQueue();
|
||||
MESSAGE_TYPE getMessageType();
|
||||
};
|
||||
|
||||
// Sentinel message instructing an ActBroadcaster thread to terminate.
class ExitBroadcastMessage : public BroadcastMessage {
public:
    ExitBroadcastMessage();
};
|
||||
|
||||
class ActBroadcaster : public Thread {
|
||||
protected:
|
||||
std::map<int,IBroadcastNetwork*> _broadcasters; // src device --> broadcaster
|
||||
Queue<BroadcastMessage*> _messageQueue;
|
||||
int _numUsers;
|
||||
public:
|
||||
ActBroadcaster(int numUsers, intv& cpus);
|
||||
~ActBroadcaster();
|
||||
Queue<BroadcastMessage*>& getMessageQueue();
|
||||
virtual void* run();
|
||||
void stop();
|
||||
};
|
||||
|
||||
|
||||
#endif /* ACTBROADCASTER_CUH_H_ */
|
180
caffe2/contrib/cuda-convnet2/cudaconvnet/include/convnet.cuh
Normal file
180
caffe2/contrib/cuda-convnet2/cudaconvnet/include/convnet.cuh
Normal file
@ -0,0 +1,180 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef CONVNET3
|
||||
#define CONVNET3
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <set>
|
||||
#include <map>
|
||||
#include <helper_cuda.h>
|
||||
#include <time.h>
|
||||
#include "../../util/include/queue.h"
|
||||
#include "../../util/include/thread.h"
|
||||
#include <math.h>
|
||||
#include "../../util/include/sync.h"
|
||||
#include "messages.cuh"
|
||||
#include "streambroadcast.cuh"
|
||||
|
||||
#include "layer.cuh"
|
||||
#include "data.cuh"
|
||||
#include "worker.cuh"
|
||||
#include "weights.cuh"
|
||||
#include "pipedispenser.cuh"
|
||||
#include "timer.cuh"
|
||||
|
||||
// Forward declarations for types defined in sibling headers
// (worker.cuh, layer.cuh, streambroadcast.cuh, weights.cuh).
class Worker;
class WorkResult;
class Layer;
class DataLayer;
class CostLayer;
class ConvNetThread;
class StreamBroadcast;
class Weights;

// name -> device id -> layer*
typedef std::map<std::string,std::map<int, Layer*> > NameReplicaLayerMap;
typedef std::map<std::string, Layer*> NameLayerMap;
// name -> ReplicaMap
//typedef std::map<int,NameLayerMap> ReplicaNameLayerMap;
typedef std::vector<ConvNetThread*> ConvNetThreadV;
typedef std::vector<DataLayer*> DataLayerVector;
//typedef std::map<int,ConvNetThreadV> ReplicaThreadsMap;
|
||||
|
||||
class ConvNet : public Thread {
|
||||
private:
|
||||
void checkGradient_copyWeightsToGPU(Matrix& weightsCPU, Weights& weights);
|
||||
protected:
|
||||
NameReplicaLayerMap _layerMap;
|
||||
DataLayerVector _dataLayers;
|
||||
// Vector of convnet threads (one thread == one GPU)
|
||||
ConvNetThreadV _convNetThreads;
|
||||
|
||||
DataProvider* _dp;
|
||||
CPUData* _data, *_bufferData;
|
||||
int _bufferMinibatchIdx, _bufferPassIdx;
|
||||
ThreadSynchronizer* _sync;
|
||||
intv _deviceIDs;
|
||||
|
||||
Queue<Worker*> _workerQueue;
|
||||
Queue<WorkResult*> _resultQueue;
|
||||
Queue<Message*> _msgQueue;
|
||||
|
||||
int _numFwdTerminal;
|
||||
std::map<int, int> _numBwdTerminal; // pass idx -> #terminal
|
||||
int _totalPassesDone;
|
||||
int _numReplicasMin, _numReplicasMax;
|
||||
// For gradient checking
|
||||
int _numFailures;
|
||||
int _numTests;
|
||||
|
||||
// Training progress (between 0 and 1).
|
||||
// Used to determine learning rate based on ParameterSchedule.
|
||||
double _trainingProgress;
|
||||
double _baseErr;
|
||||
bool _conserveMem;
|
||||
PipeDispenser *_dataCopyPD;
|
||||
|
||||
void waitForTerminals(int numMsgs, MESSAGES msg);
|
||||
void sendMessage(MESSAGES msg, bool sync);
|
||||
void sendMessage(Message* msg, bool sync);
|
||||
void findBwdTerminal(Layer& l, std::set<Layer*>& visited, int& terminal, int passIdx);
|
||||
void connectReplicas();
|
||||
void initDataLayers(PyObjectV* layerList);
|
||||
void initGPUThreads(PyObjectV* layerList);
|
||||
void connectChildren(PyObject* layerParams);
|
||||
void* run();
|
||||
void setData(CPUData& data, int passIdx);
|
||||
void setDataFromBuffer();
|
||||
void setBuffer(CPUData* bufferData, int bufferMinibatchIdx, int bufferPassIdx);
|
||||
public:
|
||||
ConvNet(PyObject* layerParams, intv& deviceIDs,
|
||||
int minibatchSize, bool conserveMem);
|
||||
~ConvNet();
|
||||
void stop();
|
||||
|
||||
Queue<Message*>& getMessageQueue();
|
||||
Queue<Worker*>& getWorkerQueue();
|
||||
Queue<WorkResult*>& getResultQueue();
|
||||
DataProvider& getDataProvider();
|
||||
|
||||
Layer& getLayer(std::string& name, int replicaID);
|
||||
void copyToCPU();
|
||||
void copyToGPU();
|
||||
void updateWeights(int passIdx);
|
||||
void reset(int passIdx);
|
||||
void reset();
|
||||
|
||||
void bprop(int passIdx, PASS_TYPE passType);
|
||||
void fprop(int miniIdx, int passIdx, PASS_TYPE passType);
|
||||
void fprop(CPUData& data, int passIdx, PASS_TYPE passType);
|
||||
|
||||
void setTrainingProgress(double progress);
|
||||
double getTrainingProgress() const;
|
||||
|
||||
bool checkGradient(const std::string& name, float eps, Weights& weights);
|
||||
void checkGradients();
|
||||
Cost& getCost();
|
||||
Cost& getCost(Cost& cost);
|
||||
CPUData& getData(); // Returns last minibatch fpropped
|
||||
double getCostValue();
|
||||
intv& getDeviceIDs();
|
||||
ThreadSynchronizer& getSync();
|
||||
void syncWithChildren();
|
||||
int getMinibatchSize();
|
||||
bool isConserveMemory();
|
||||
int getNumReplicasMax();
|
||||
int getNumReplicasMin();
|
||||
int getNumPasses();
|
||||
int getTotalPassesDone();
|
||||
PipeDispenser& getDataCopyPD();
|
||||
};
|
||||
|
||||
class ConvNetThread : public Thread {
|
||||
protected:
|
||||
NameLayerMap _nameLayerMap;
|
||||
std::vector<CostLayer*> _costs;
|
||||
ConvNet* _convNet;
|
||||
int _deviceID;
|
||||
Queue<Message*> _msgQueue;
|
||||
Timer _timer;
|
||||
// StreamBroadcast* _weightSynchronizer;
|
||||
|
||||
void initCuda();
|
||||
virtual void initLayer(PyObject* paramsDict, int replicaID);
|
||||
void* run();
|
||||
public:
|
||||
ConvNetThread(PyObjectV* layerList, int deviceID, int deviceIdx, ConvNet* convNet);
|
||||
~ConvNetThread();
|
||||
|
||||
NameLayerMap& getLayerMap();
|
||||
int getDeviceID();
|
||||
|
||||
ConvNet& getConvNet();
|
||||
|
||||
Queue<Message*>& getMessageQueue();
|
||||
std::vector<CostLayer*>& getCostLayers();
|
||||
// StreamBroadcast& getWeightSynchronizer();
|
||||
|
||||
Cost& getCost();
|
||||
Layer& getLayer(std::string& name);
|
||||
void startTimer();
|
||||
double stopTimer();
|
||||
};
|
||||
|
||||
#endif /* CONVNET */
|
||||
|
@ -0,0 +1,218 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef COPYPIPELINE_CUH_
|
||||
#define COPYPIPELINE_CUH_
|
||||
|
||||
#include <set>
|
||||
#include "../../util/include/thread.h"
|
||||
#include "../../util/include/queue.h"
|
||||
#include <helper_cuda.h>
|
||||
#include "../../nvmatrix/include/nvmatrix.cuh"
|
||||
#include "util.cuh"
|
||||
|
||||
// Chunking parameters for pipelined device-to-device copies: a matrix is
// split into between COPY_MIN_CHUNKS and COPY_MAX_CHUNKS pieces, each at
// least COPY_MIN_CHUNK_SIZE elements.
#define COPY_MIN_CHUNK_SIZE (1<<18) // 256k
#define COPY_MAX_CHUNKS 16
#define COPY_MIN_CHUNKS 2

class CopyPeer;
class CopySource;
class ICopySegment;
class IBroadcastNetwork;
||||
|
||||
class CopyMessage {
|
||||
protected:
|
||||
std::map<int,NVMatrix*>* _mats;
|
||||
float _scaleSource, _scaleTargets;
|
||||
public:
|
||||
enum COPY_MESSAGE_TYPE {
|
||||
COPY_CHUNK,
|
||||
COPY_START,
|
||||
EXIT
|
||||
};
|
||||
CopyMessage(COPY_MESSAGE_TYPE msgType, float scaleSource, float scaleTargets, std::map<int, NVMatrix*>& mats)
|
||||
: _msgType(msgType), _scaleSource(scaleSource), _scaleTargets(scaleTargets), _mats(&mats) {
|
||||
}
|
||||
CopyMessage(COPY_MESSAGE_TYPE msgType)
|
||||
: _msgType(msgType), _scaleSource(0), _scaleTargets(0), _mats(NULL) {
|
||||
}
|
||||
inline COPY_MESSAGE_TYPE getType() const {
|
||||
return _msgType;
|
||||
}
|
||||
inline NVMatrix& getMatrix(int deviceID) const {
|
||||
return *_mats->at(deviceID);
|
||||
}
|
||||
inline std::map<int,NVMatrix*>& getMatrices() const {
|
||||
return *_mats;
|
||||
}
|
||||
inline float getScaleSource() const {
|
||||
return _scaleSource;
|
||||
}
|
||||
inline float getScaleTargets() const {
|
||||
return _scaleTargets;
|
||||
}
|
||||
protected:
|
||||
COPY_MESSAGE_TYPE _msgType;
|
||||
};
|
||||
|
||||
class CopyChunkMessage : public CopyMessage {
|
||||
protected:
|
||||
int _chunkIdx;
|
||||
int _chunkSize;
|
||||
int _numChunks;
|
||||
public:
|
||||
CopyChunkMessage(int chunkIdx, int chunkSize, int numChunks, float scaleSource, float scaleTargets, std::map<int, NVMatrix*>& mats)
|
||||
: _chunkIdx(chunkIdx), _chunkSize(chunkSize), _numChunks(numChunks), CopyMessage(COPY_CHUNK, scaleSource, scaleTargets, mats) {
|
||||
}
|
||||
|
||||
inline int getChunkIdx() const {
|
||||
return _chunkIdx;
|
||||
}
|
||||
inline int getChunkSize() const {
|
||||
return _chunkSize;
|
||||
}
|
||||
inline int getNumChunks() const {
|
||||
return _numChunks;
|
||||
}
|
||||
};
|
||||
|
||||
class CopyStartMessage : public CopyMessage {
|
||||
public:
|
||||
CopyStartMessage(float scaleSource, float scaleTargets, std::map<int,NVMatrix*>& mats) : CopyMessage(COPY_START, scaleSource, scaleTargets, mats) {
|
||||
}
|
||||
};
|
||||
|
||||
class ICopySegment : public Thread {
|
||||
protected:
|
||||
int _deviceID, _execDeviceID;
|
||||
cudaStream_t _stream;
|
||||
ICopySegment* _prev;
|
||||
std::vector<CopyPeer*> _next;
|
||||
Queue<CopyMessage*> _queue;
|
||||
Queue<int>* _finishQueue;
|
||||
HostNVMatrix _hmat;
|
||||
IBroadcastNetwork* _parent;
|
||||
|
||||
NVMatrix& getChunk(NVMatrix& mat, int chunkSize, int chunkIdx);
|
||||
void* run();
|
||||
virtual bool processMessage(CopyMessage& msg) = 0;
|
||||
|
||||
public:
|
||||
ICopySegment(IBroadcastNetwork& parent, int deviceID, Queue<int>* finishQueue);
|
||||
virtual ~ICopySegment();
|
||||
inline NVMatrix& getMatrix(CopyMessage& msg);
|
||||
Queue<CopyMessage*>& getQueue();
|
||||
inline int getDeviceID();
|
||||
void addPrev(ICopySegment& c);
|
||||
void addNext(CopyPeer& c);
|
||||
bool isTerminal() const;
|
||||
virtual bool isSource() const = 0;
|
||||
};
|
||||
|
||||
class CopySource : public ICopySegment {
|
||||
protected:
|
||||
bool processMessage(CopyMessage& msg);
|
||||
public:
|
||||
CopySource(IBroadcastNetwork& parent, int deviceID);
|
||||
inline bool isSource() const;
|
||||
};
|
||||
|
||||
class CopyPeer : public ICopySegment {
|
||||
protected:
|
||||
bool processMessage(CopyMessage& msg);
|
||||
public:
|
||||
CopyPeer(IBroadcastNetwork& parent, int deviceID, Queue<int>* finishQueue);
|
||||
inline bool isSource() const;
|
||||
};
|
||||
|
||||
class IBroadcastNetwork {
|
||||
protected:
|
||||
Queue<int> _finishQueue;
|
||||
CopySource* _src;
|
||||
std::vector<CopyPeer*> _peers;
|
||||
int _srcDeviceID, _numTerminal;
|
||||
bool _constructed;
|
||||
std::set<int> _devices;
|
||||
std::pair<std::vector<int>,std::vector<int> > makeGPULists();
|
||||
|
||||
void makePeers(std::pair<std::vector<int>,std::vector<int> >& gpus);
|
||||
virtual void makeConnections() = 0;
|
||||
virtual void _broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets);
|
||||
IBroadcastNetwork(std::set<int>& devices, int srcDeviceID, int numTerminal);
|
||||
public:
|
||||
virtual IBroadcastNetwork& construct();
|
||||
virtual ~IBroadcastNetwork();
|
||||
|
||||
virtual void broadcast(std::map<int, NVMatrix*>& mats);
|
||||
int getSourceDeviceID() const;
|
||||
static IBroadcastNetwork& make(std::set<int> devices, int srcDeviceID);
|
||||
};
|
||||
|
||||
class ISafeBroadcastNetwork : public IBroadcastNetwork {
|
||||
protected:
|
||||
ISafeBroadcastNetwork(std::set<int>& devices, int srcDeviceID, int numTerminal);
|
||||
public:
|
||||
virtual void broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets);
|
||||
virtual ISafeBroadcastNetwork& construct();
|
||||
static ISafeBroadcastNetwork& make(std::set<int> devices, int srcDeviceID);
|
||||
};
|
||||
|
||||
class NullBroadcaster : public ISafeBroadcastNetwork {
|
||||
protected:
|
||||
NullBroadcaster(std::set<int>& devices, int srcDeviceID);
|
||||
void makeConnections();
|
||||
public:
|
||||
NullBroadcaster& construct();
|
||||
void broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets);
|
||||
void broadcast(std::map<int, NVMatrix*>& mats);
|
||||
friend class IBroadcastNetwork;
|
||||
friend class ISafeBroadcastNetwork;
|
||||
};
|
||||
|
||||
/*
|
||||
* This one goes to host and then to targets.
|
||||
*/
|
||||
class NaiveBroadcaster : public ISafeBroadcastNetwork {
|
||||
protected:
|
||||
NaiveBroadcaster(std::set<int>& devices, int srcDeviceID);
|
||||
void makeConnections();
|
||||
friend class IBroadcastNetwork;
|
||||
friend class ISafeBroadcastNetwork;
|
||||
};
|
||||
|
||||
// Broadcast topology specialized for an 8-GPU machine.
class EightGPUBroadcaster1 : public IBroadcastNetwork {
protected:
    EightGPUBroadcaster1(std::set<int>& devices, int srcDeviceID);
    void makeConnections();
    friend class IBroadcastNetwork;
};
|
||||
|
||||
class TwoPeeringGPUsBroadcaster : public ISafeBroadcastNetwork {
|
||||
protected:
|
||||
int _tgtDeviceID;
|
||||
cudaStream_t _tgtStream;
|
||||
void makeConnections();
|
||||
void resetDeviceID(int d);
|
||||
void _broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets);
|
||||
public:
|
||||
TwoPeeringGPUsBroadcaster(std::set<int>& devices, int srcDeviceID);
|
||||
~TwoPeeringGPUsBroadcaster();
|
||||
ISafeBroadcastNetwork& construct();
|
||||
friend class IBroadcastNetwork;
|
||||
friend class ISafeBroadcastNetwork;
|
||||
};
|
||||
|
||||
#endif /* COPYPIPELINE_CUH_ */
|
56
caffe2/contrib/cuda-convnet2/cudaconvnet/include/cost.cuh
Normal file
56
caffe2/contrib/cuda-convnet2/cudaconvnet/include/cost.cuh
Normal file
@ -0,0 +1,56 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef COST_CUH
|
||||
#define COST_CUH
|
||||
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <helper_cuda.h>
|
||||
|
||||
#include "layer.cuh"
|
||||
#include "util.cuh"
|
||||
|
||||
class CostLayer;
|
||||
|
||||
/*
|
||||
* Wrapper for dictionary mapping cost name to vector of returned values.
|
||||
*/
|
||||
class Cost {
|
||||
protected:
|
||||
std::map<std::string,int> _numCases;
|
||||
CostMap _costMap;
|
||||
CostCoeffMap _costCoeffMap;
|
||||
std::map<std::string,int>& getNumCasesMap();
|
||||
public:
|
||||
Cost();
|
||||
Cost(std::vector<CostLayer*>& costs);
|
||||
doublev& operator [](const std::string s);
|
||||
CostMap& getCostMap();
|
||||
CostCoeffMap& getCostCoeffMap();
|
||||
int getNumCases();
|
||||
/*
|
||||
* Returns sum of first values returned by all the CostLayers, weighted by the cost coefficients.
|
||||
*/
|
||||
double getValue();
|
||||
Cost& operator += (Cost& er);
|
||||
virtual ~Cost();
|
||||
void print();
|
||||
};
|
||||
|
||||
|
||||
#endif /* COST_CUH */
|
||||
|
101
caffe2/contrib/cuda-convnet2/cudaconvnet/include/data.cuh
Normal file
101
caffe2/contrib/cuda-convnet2/cudaconvnet/include/data.cuh
Normal file
@ -0,0 +1,101 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef DATA_CUH
|
||||
#define DATA_CUH
|
||||
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include "util.cuh"
|
||||
|
||||
class CPUData {
|
||||
protected:
|
||||
MatrixV* _data;
|
||||
void assertDimensions() {
|
||||
assert(_data->size() > 0);
|
||||
for (int i = 1; i < _data->size(); i++) {
|
||||
assert(_data->at(i-1)->getNumCols() == _data->at(i)->getNumCols());
|
||||
if (_data->at(i-1)->isTrans() != _data->at(i)->isTrans() && _data->at(i)->getNumElements() < 2) {
|
||||
_data->at(i)->setTrans(_data->at(i-1)->isTrans());
|
||||
}
|
||||
assert(_data->at(i-1)->isTrans() == _data->at(i)->isTrans());
|
||||
}
|
||||
assert(_data->at(0)->getNumCols() > 0);
|
||||
}
|
||||
public:
|
||||
typedef typename MatrixV::iterator T_iter;
|
||||
// Cases in columns, but array may be transposed
|
||||
// (so in memory they can really be in rows -- in which case the array is transposed
|
||||
// during the copy to GPU).
|
||||
CPUData(PyObject* pyData) {
|
||||
_data = getMatrixV(pyData);
|
||||
assertDimensions();
|
||||
}
|
||||
|
||||
CPUData(MatrixV* data) : _data(data) {
|
||||
assertDimensions();
|
||||
}
|
||||
|
||||
~CPUData() {
|
||||
for (T_iter it = _data->begin(); it != _data->end(); ++it) {
|
||||
delete *it;
|
||||
}
|
||||
delete _data;
|
||||
}
|
||||
|
||||
Matrix& operator [](int idx) const {
|
||||
return *_data->at(idx);
|
||||
}
|
||||
|
||||
int getSize() const {
|
||||
return _data->size();
|
||||
}
|
||||
|
||||
MatrixV& getData() const {
|
||||
return *_data;
|
||||
}
|
||||
|
||||
Matrix& getData(int i) const {
|
||||
return *_data->at(i);
|
||||
}
|
||||
|
||||
bool isTrans() const {
|
||||
return _data->at(0)->isTrans();
|
||||
}
|
||||
|
||||
int getNumCases() const {
|
||||
return _data->at(0)->getNumCols();
|
||||
}
|
||||
};
|
||||
|
||||
class DataProvider {
|
||||
protected:
|
||||
CPUData* _hData;
|
||||
NVMatrixV _data;
|
||||
int _minibatchSize;
|
||||
public:
|
||||
DataProvider(int minibatchSize);
|
||||
void setData(CPUData&);
|
||||
void clearData();
|
||||
CPUData& getMinibatch(int idx);
|
||||
CPUData& getDataSlice(int startCase, int endCase);
|
||||
int getNumMinibatches();
|
||||
int getMinibatchSize();
|
||||
int getNumCases();
|
||||
};
|
||||
|
||||
#endif /* DATA_CUH */
|
||||
|
@ -0,0 +1,88 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef GRADREDUCER_CUH_
|
||||
#define GRADREDUCER_CUH_
|
||||
|
||||
#include <set>
|
||||
#include <algorithm>
|
||||
#include "streambroadcast.cuh"
|
||||
#include "reducepipeline.cuh"
|
||||
#include "layer.cuh"
|
||||
#include "util.cuh"
|
||||
|
||||
class StreamBroadcast;
class Layer;

// Sentinel device-ID value that tells a gradient reducer to exit.
#define ACT_GRAD_REDUCER_EXIT (1 << 16)

//class ReduceMessage {
//    ReduceMessage();
//    ReduceMessage(bool exit);
//};
|
||||
|
||||
class IActGradReducer : public Thread {
|
||||
protected:
|
||||
Layer* _parent;
|
||||
Queue<int> _finishQueue;
|
||||
int _numExpectedMsgsTotal;
|
||||
std::map<int,int> _numExpectedMsgs; // map from device id -> num expected msgs
|
||||
|
||||
void* run();
|
||||
virtual bool reduce() = 0;
|
||||
virtual void reset() = 0;
|
||||
public:
|
||||
IActGradReducer(Layer& parent, std::map<int, int> numExpectedMsgs);
|
||||
virtual ~IActGradReducer();
|
||||
int waitForFinish();
|
||||
virtual void enqueueReduction(int deviceID) = 0;
|
||||
virtual void stop() = 0;
|
||||
static IActGradReducer& makeGradReducer(Layer& parent, std::map<int, int> numExpectedMsgs);
|
||||
};
|
||||
|
||||
class SequentialActGradReducer : public IActGradReducer {
|
||||
protected:
|
||||
|
||||
std::map<int,int> _numReceivedMsgs; // map from device id -> num received msgs
|
||||
|
||||
std::map<int,Queue<int>* > _messageQueues;
|
||||
intv _deviceIDs;
|
||||
StreamBroadcast* _broadcaster;
|
||||
bool reduce();
|
||||
void reset();
|
||||
public:
|
||||
SequentialActGradReducer(Layer& parent, std::map<int, int> numExpectedMsgs);
|
||||
~SequentialActGradReducer();
|
||||
void enqueueReduction(int deviceID);
|
||||
void stop();
|
||||
};
|
||||
|
||||
class ParallelActGradReducer : public IActGradReducer {
|
||||
protected:
|
||||
IEightGPUReducer* _reducer;
|
||||
int _numReceivedMsgs;
|
||||
float _scaleTarget;
|
||||
Queue<int> _messageQueue;
|
||||
bool reduce();
|
||||
void reset();
|
||||
public:
|
||||
ParallelActGradReducer(Layer& parent, std::map<int, int> numExpectedMsgs);
|
||||
void enqueueReduction(int deviceID);
|
||||
void stop();
|
||||
};
|
||||
|
||||
|
||||
#endif /* GRADREDUCER_CUH_ */
|
61
caffe2/contrib/cuda-convnet2/cudaconvnet/include/jpeg.h
Normal file
61
caffe2/contrib/cuda-convnet2/cudaconvnet/include/jpeg.h
Normal file
@ -0,0 +1,61 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef JPEG_MAIN_H
|
||||
#define JPEG_MAIN_H
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <Python.h>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
#include <jpeglib.h>
|
||||
//#include <arrayobject.h>
|
||||
#include "../../util/include/thread.h"
|
||||
#include "../../util/include/matrix.h"
|
||||
|
||||
// Ceiling division, guarded because other headers define it too.
#ifndef DIVUP
#define DIVUP(x, y) (((x) + (y) - 1) / (y))
#endif

// Number of concurrent DecoderThread workers used for JPEG decoding.
#define NUM_JPEG_DECODER_THREADS 4
|
||||
|
||||
|
||||
class DecoderThread : public Thread {
|
||||
protected:
|
||||
PyObject* _pyList;
|
||||
Matrix* _target;
|
||||
int64 _start_img, _end_img;
|
||||
int64 _img_size, _inner_size, _inner_pixels;
|
||||
bool _test, _multiview;
|
||||
|
||||
unsigned char* _decodeTarget;
|
||||
int64 _decodeTargetSize;
|
||||
unsigned int _rseed;
|
||||
|
||||
void* run();
|
||||
void decodeJpeg(int idx, int& width, int& height);
|
||||
double randUniform();
|
||||
double randUniform(double min, double max);
|
||||
void crop(int64 i, int64 width, int64 height, bool flip);
|
||||
virtual void crop(int64 i, int64 src_width, int64 src_height, bool flip, int64 crop_start_x, int64 crop_start_y);
|
||||
public:
|
||||
DecoderThread(PyObject* pyList, Matrix& target, int start_img, int end_img, int img_size, int inner_size, bool test, bool multiview);
|
||||
virtual ~DecoderThread();
|
||||
};
|
||||
|
||||
#endif // JPEG_MAIN_H
|
812
caffe2/contrib/cuda-convnet2/cudaconvnet/include/layer.cuh
Normal file
812
caffe2/contrib/cuda-convnet2/cudaconvnet/include/layer.cuh
Normal file
@ -0,0 +1,812 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef LAYER_CUH
|
||||
#define LAYER_CUH
|
||||
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <assert.h>
|
||||
#include <helper_timer.h>
|
||||
#include "../../nvmatrix/include/nvmatrix.cuh"
|
||||
//#include "experimental/akrizhevsky/g3/mactruck-gpu-tests/gpu_util.cuh"
|
||||
|
||||
#include "weights.cuh"
|
||||
#include "convnet.cuh"
|
||||
#include "cost.cuh"
|
||||
#include "neuron.cuh"
|
||||
#include "data.cuh"
|
||||
#include "layer_kernels.cuh"
|
||||
#include "streambroadcast.cuh"
|
||||
#include "actbroadcaster.cuh"
|
||||
#include "gradreducer.cuh"
|
||||
#include "util.cuh"
|
||||
#include "timer.cuh"
|
||||
#include "memorysource.cuh"
|
||||
|
||||
// Forward declarations for the mutually-referential layer machinery.
class Cost;
class ConvNet;
class ConvNetThread;
class CostLayer;
class DataLayer;
class Layer;
class ActBroadcaster;
class BroadcastMessage;
class IActGradReducer;
class Weights;
class WeightList;
typedef std::vector<Layer*> LayerV;
|
||||
|
||||
class BinomialCrossEntOperator {
|
||||
protected:
|
||||
float _posWeight;
|
||||
public:
|
||||
BinomialCrossEntOperator(float posWeight) : _posWeight(posWeight) {
|
||||
}
|
||||
__device__ inline float operator()(const float t, const float y) const {
|
||||
return _posWeight * t * safelog(y) + (1.0f - t) * safelog(1.0f - y);
|
||||
}
|
||||
};
|
||||
|
||||
class CrossEntOperator {
|
||||
protected:
|
||||
float _posWeight;
|
||||
public:
|
||||
CrossEntOperator(float posWeight) : _posWeight(posWeight) {
|
||||
}
|
||||
__device__ inline float operator()(const float t, const float y) const {
|
||||
return _posWeight * t * safelog(y);
|
||||
}
|
||||
};
|
||||
|
||||
/*
|
||||
* Abstract layer.
|
||||
*/
|
||||
class Layer {
|
||||
protected:
|
||||
ConvNetThread* _convNetThread;
|
||||
|
||||
// This is a vector[#layers_next]
|
||||
std::vector<Layer*> _next;
|
||||
// This is a vector[#replicas_prev][#layers_prev]
|
||||
std::map<int, std::vector<Layer*> > _prev;
|
||||
|
||||
int _rcvdFInputMsgs;
|
||||
std::map<int, int> _numComputedActsGrads;
|
||||
int _rcvdBInputMsgs;
|
||||
int _numOutputs;
|
||||
std::map<int, NVMatrix*> _inputs; // input idx -> matrix
|
||||
std::map<int, MemoryView*> _memSrcActs; // device id -> memory source
|
||||
std::map<int, MemoryView*> _memSrcActsGrad; // device id -> memory source
|
||||
|
||||
bool _gradConsumer, _foundGradConsumers, _trans;
|
||||
std::map<int,bool> _bwdTerminal; // One bool per pass
|
||||
int _numGradProducersNext;
|
||||
int _actsTarget, _actsGradTarget;
|
||||
std::string _name, _type;
|
||||
intv _nextDeviceIDs, _prevDeviceIDs;
|
||||
HostNVMatrix _hostMemFwd;
|
||||
|
||||
// New replica-related stuff:
|
||||
std::map<int,Layer*> _replicas; // NOTE: a layer is its own sibling, too
|
||||
// Previous layers sorted by device ID, in reverse order in which they are procesed by
|
||||
// sequential grad reducer. map from replica -> device id -> layers
|
||||
std::map<int,std::map<int,std::set<Layer*> > > _prevByDevice;
|
||||
std::map<std::string, int> _inputIndices;
|
||||
int _replicaID;
|
||||
int _numReplicas;
|
||||
int _numReplicasPrev, _numReplicasNext;
|
||||
|
||||
Queue<int> _broadcastFinishQueue;
|
||||
Queue<int> _reductionFinishQueue;
|
||||
ActBroadcaster* _actBroadcaster;
|
||||
IActGradReducer* _gradReducer;
|
||||
Timer _timer;
|
||||
bool _initialized;
|
||||
|
||||
virtual void fpropNext(PASS_TYPE passType, int passIdx);
|
||||
virtual void truncBwdActs();
|
||||
virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) = 0;
|
||||
|
||||
virtual void bpropCommon(NVMatrix& v, int replicaIdx, PASS_TYPE passType) {
|
||||
// Do nothing by default
|
||||
}
|
||||
virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
|
||||
assert(!isGradProducer()); // Only do nothing if not grad producer
|
||||
}
|
||||
virtual void fpropCommon(PASS_TYPE passType) {
|
||||
|
||||
}
|
||||
void bpropActsCall(NVMatrix& v, PASS_TYPE passType, int replicaIdx, int inputIdx);
|
||||
|
||||
ActBroadcaster& getActBroadcaster();
|
||||
IActGradReducer& getGradReducer();
|
||||
int getInputIdx(std::string& parentName);
|
||||
void setInputIdx(std::string& parentName, int idx);
|
||||
|
||||
public:
|
||||
static bool _saveActsGrad, _saveActs;
|
||||
|
||||
Layer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans);
|
||||
virtual ~Layer();
|
||||
|
||||
virtual bool fprop(PASS_TYPE passType, int passIdx);
|
||||
void fprop(NVMatrix& v, int inpIdx, PASS_TYPE passType, int passIdx);
|
||||
virtual void fprop(std::map<int,NVMatrix*>& v, PASS_TYPE passType, int passIdx);
|
||||
virtual void bprop(PASS_TYPE passType, int passIdx);
|
||||
virtual void bprop(NVMatrix& v, PASS_TYPE passType, int passIdx);
|
||||
virtual void reset();
|
||||
virtual void resetPassIdx();
|
||||
int getNumCases(NVMatrix& v);
|
||||
int& getNumComputedActsGrads(int deviceID);
|
||||
int incRcvdBInputMsgs();
|
||||
bool isGradConsumer();
|
||||
bool hasGradProducerNext(std::string& layerName);
|
||||
// Does this layer produce a gradient for any layer?
|
||||
virtual bool isGradProducer();
|
||||
// Does this layer produce a gradient for layer of given name?
|
||||
virtual bool isGradProducer(std::string& layerName);
|
||||
std::string& getName();
|
||||
std::string& getType();
|
||||
virtual void addNext(Layer& l);
|
||||
virtual void addPrev(Layer& l, int replicaIdx);
|
||||
virtual void addReplica(Layer& l);
|
||||
std::map<int,std::vector<Layer*> >& getPrev();
|
||||
std::vector<Layer*>& getNext();
|
||||
virtual NVMatrix& getActs();
|
||||
virtual NVMatrix& getActs(int deviceID);
|
||||
virtual NVMatrix& getActs(int deviceID, int numCases);
|
||||
virtual NVMatrix& getActsGrad();
|
||||
virtual NVMatrix& getActsGrad(int deviceID);
|
||||
virtual std::map<int,NVMatrix*> getAllActs();
|
||||
virtual std::map<int, NVMatrix*> getAllActsGrads();
|
||||
virtual bool postInit();
|
||||
int getDeviceID();
|
||||
ConvNetThread& getConvNetThread();
|
||||
cudaStream_t getStream();
|
||||
void syncStream();
|
||||
void setBwdTerminal(int passIdx);
|
||||
// Do nothing if this layer has no weights
|
||||
virtual bool updateWeights() {
|
||||
return false;
|
||||
}
|
||||
virtual bool constrainWeights() {
|
||||
return false;
|
||||
}
|
||||
virtual void checkGradient() {
|
||||
}
|
||||
virtual void copyToCPU() {
|
||||
}
|
||||
virtual void copyToGPU() {
|
||||
}
|
||||
intv& getNextDeviceIDs() {
|
||||
return _nextDeviceIDs;
|
||||
}
|
||||
|
||||
int getReplicaID();
|
||||
int getNumReplicas();
|
||||
int getNumSiblingReplicas();
|
||||
int getNumReplicasPrev();
|
||||
int getNumReplicasNext();
|
||||
int getNumOutputs();
|
||||
void setMemorySourceActs(int deviceID, MemoryView& mem);
|
||||
void setMemorySourceActsGrad(int deviceID, MemoryView& mem);
|
||||
MemoryView& getMemorySourceActs(int deviceID);
|
||||
MemoryView& getMemorySourceActsGrad(int deviceID);
|
||||
int getFwdActiveInputReplicaIdx(int passIdx);
|
||||
int getBwdActiveInputReplicaIdx(int passIdx);
|
||||
int getFwdActiveReplicaIdx(int passIdx);
|
||||
int getNumLayersPrev();
|
||||
virtual int getNumInputReplicas();
|
||||
int getNumExpectedBwdMsgs();
|
||||
int getNumExpectedFwdMsgs();
|
||||
int getReplicaIdx();
|
||||
int getActivePassPeriod();
|
||||
int getNumGradProducersNext();
|
||||
virtual ConvNet& getConvNet();
|
||||
};
|
||||
|
||||
/*
 * Mixin for layers whose inputs are 2D (square) image maps.
 * Fields are populated from the Python layer-parameter dict;
 * presumably _imgPixels == _imgSize * _imgSize — confirm in the .cu impl.
 */
class TwoDLayerInterface {
protected:
    int _channels, _imgSize, _imgPixels;
public:
    TwoDLayerInterface(PyObject* paramsDict);
};
|
||||
|
||||
/*
 * Layer that applies an elementwise nonlinearity (a Neuron) to its input.
 * The concrete activation function is selected by _neuronType.
 */
class NeuronLayer : public Layer {
protected:
    Neuron* _neuron;            // owned activation-function object
    std::string _neuronType;    // name of the nonlinearity, as given in the param dict

    virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
    // Fused backward path for special neuron/cost combinations; returns whether
    // it handled the bprop itself — TODO confirm exact contract in the .cu impl.
    virtual bool bpropSpecial(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    /*
     * Elementwise gradient for logistic units fed into a cross-entropy cost,
     * with positive examples weighted by _posWeight:
     *   grad = coeff * (posWeight * t * (1 - y) + (t - 1) * y)
     * where y = logistic output, t = target label.
     */
    class CrossEntLogisticGradientOperator {
    private:
        float _coeff, _posWeight;
    public:
        CrossEntLogisticGradientOperator(float coeff, float posWeight) : _coeff(coeff), _posWeight(posWeight) {
        }
        __device__ inline float operator()(const float y, const float t) const {
            return _coeff * (_posWeight * t * (1.0f - y) + (t - 1.0f) * y);
        }
    };
    NeuronLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
    ~NeuronLayer();
    std::string& getNeuronType();
};
|
||||
|
||||
/*
 * Base class for layers with learnable parameters (weights + biases).
 * Owns the weight/bias update machinery; subclasses implement the actual
 * weight/bias gradient computations (bpropWeights / bpropBiases).
 */
class WeightLayer : public Layer {
protected:
    WeightList* _weights;        // per-input weight matrices
    Weights *_biases;            // shared bias vector
    NVMatrix _norm2;             // scratch for norm-based weight constraints
    float _wStep, _bStep;
    int _weightUpdatePassPeriod; // update weights only every N passes
    void fpropCommon(PASS_TYPE passType);
    void bpropCommon(NVMatrix& v, int replicaIdx, PASS_TYPE passType);
    virtual void bpropBiases(NVMatrix& v, PASS_TYPE passType) = 0;
    virtual void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType) = 0;
    virtual void _constrainWeights();
    // Scale factors applied to gradients / gradient increments during bprop.
    virtual float getGradScale(int inpIdx, PASS_TYPE passType);
    virtual float getIncScale(int inpIdx, PASS_TYPE passType);
    virtual float getBGradScale(PASS_TYPE passType);
    virtual float getBIncScale();
    virtual NVMatrix& getGradTarget(int inpIdx);
    NVMatrix& getWeightMatrix(PASS_TYPE passType, int inpIdx);
    NVMatrix& getBiasMatrix(PASS_TYPE passType);
public:
    WeightLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans, bool useGrad);
    virtual ~WeightLayer();
    virtual bool updateWeights();
    virtual bool constrainWeights();
    virtual void copyToCPU();
    virtual void copyToGPU();
    virtual void checkGradient();
    Weights& getWeights(int idx);
    void addReplica(Layer& l);
    virtual bool postInit();
};
|
||||
|
||||
/*
 * Fully-connected layer: dense matrix multiply of inputs by weights plus bias.
 */
class FCLayer : public WeightLayer {
protected:
    virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
    virtual void bpropBiases(NVMatrix& v, PASS_TYPE passType);
    virtual void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType);
    virtual void _constrainWeights();
public:
    FCLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad);
    FCLayer();  // default ctor for subclasses that defer initialization
};
|
||||
|
||||
/*
 * Fully-connected layer whose weight matrix is split into _numParts pieces
 * (presumably to process the parts separately — confirm in the .cu impl).
 */
class SplitFCLayer : public FCLayer {
protected:
    int _numParts;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
//    void bpropBiases(NVMatrix& v, PASS_TYPE passType);
    void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType);
    void splitWeights();
public:
    SplitFCLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad);
};
|
||||
|
||||
/*
 * Softmax over the layer's input. _max/_sum are scratch buffers for the
 * numerically-stable (max-subtracted) softmax computation.
 */
class SoftmaxLayer : public Layer {
protected:
    bool _doUpperGrad;   // when set, gradient is taken from the layer above (see setDoUpperGrad)
    NVMatrix _max, _sum;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    SoftmaxLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
    void setDoUpperGrad(bool b);
};
|
||||
|
||||
/*
 * Concatenates multiple inputs along the output dimension.
 * _copyOffsets holds the per-input offsets into the concatenated output.
 */
class ConcatenationLayer : public Layer {
protected:
    intv* _copyOffsets;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    ConcatenationLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
    virtual ~ConcatenationLayer();
};
|
||||
|
||||
/*
 * Identity layer: forwards its input unchanged. Backed by a shared
 * MemorySource (see memory header), hence the postInit override.
 */
class PassThroughLayer : public Layer {
protected:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    PassThroughLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
    virtual bool postInit();
};
|
||||
|
||||
/*
 * Elementwise weighted sum of its inputs; _coeffs holds one coefficient
 * per input.
 */
class EltwiseSumLayer : public Layer {
protected:
    floatv* _coeffs;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    EltwiseSumLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
    ~EltwiseSumLayer();
};
|
||||
|
||||
/*
 * Elementwise maximum over its inputs.
 */
class EltwiseMaxLayer : public Layer {
protected:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    EltwiseMaxLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Sums input elements with stride _stride — presumably reducing groups of
 * _stride consecutive elements; confirm in the .cu implementation.
 */
class SumLayer : public Layer {
protected:
    int _stride;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    SumLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Message passed to a DataCopyThread: either a request to copy a CPU data
 * batch to the GPU (COPY) or a shutdown request (EXIT).
 */
class DataCopyMessage {
public:
    enum MESSAGE_TYPE {
        COPY,
        EXIT
    };
protected:
    CPUData* _cpuData;   // batch to copy; NULL for EXIT messages
    int _passIdx;
    bool _other;         // selects which of the two data buffers to fill (see DataLayer::toggleBuffer)
    DataCopyMessage::MESSAGE_TYPE _type;
    // Protected ctor used by subclasses (e.g. DataCopyExitMessage) for non-COPY types.
    DataCopyMessage(DataCopyMessage::MESSAGE_TYPE type) : _cpuData(NULL), _other(false), _passIdx(0), _type(type) {
    }
public:
    DataCopyMessage(CPUData& cpuData, bool other, int passIdx) : _cpuData(&cpuData), _other(other), _passIdx(passIdx), _type(DataCopyMessage::COPY) {
    }

    // Only valid for COPY messages (_cpuData is NULL for EXIT).
    CPUData& getData() const {
        return *_cpuData;
    }

    int getPassIdx() const {
        return _passIdx;
    }

    bool isOther() const {
        return _other;
    }

    DataCopyMessage::MESSAGE_TYPE getType() {
        return _type;
    }
};
|
||||
|
||||
/*
 * Sentinel message telling a DataCopyThread to shut down.
 */
class DataCopyExitMessage : public DataCopyMessage {
public:
    DataCopyExitMessage() : DataCopyMessage(DataCopyMessage::EXIT) {
    }
};
|
||||
|
||||
class DataCopyThread;
|
||||
|
||||
/*
 * Input layer: feeds (a slice of) the CPU-side data batch into the network.
 * Uses a DataCopyThread plus a second buffer (_memSrcActs2) so the next
 * batch can be copied to the GPU while the current one is being used.
 */
class DataLayer : public Layer {
protected:
    bool _useBuffer;     // whether reads currently come from the secondary buffer
    int _dataIdx;        // index of this layer's matrix within the CPUData batch
    ConvNet* _convNet;
//    std::map<int, NVMatrix*> _outputs2; // Buffer for copying data during computation
    std::map<int, MemoryView*> _memSrcActs2; // buffer for copying data during computation
    std::map<int, cudaStream_t> _copyStreams; // per-device streams for async host->device copies
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    Queue<int> _copyFinishQueue;        // signalled by the copier when a copy completes
    DataCopyThread* _copier;
    bool _outstandingCopyRequest;
    int _start, _end;                   // row range of the batch this replica consumes

public:
    void fprop(PASS_TYPE passType, int passIdx, bool fromBuffer);
    DataLayer(ConvNet* convNet, PyObject* paramsDict, int replicaID);
    ~DataLayer();
    NVMatrix& getActs(int deviceID);
//    NVMatrix& getActs(int deviceID, bool other);
    NVMatrix& getActs(int deviceID, bool other, int numCases);
    bool isGradProducer();
    // Swap primary/secondary buffers for the given pass.
    void toggleBuffer(int passIdx);
    // Enqueue an async copy of the given CPU batch into the GPU buffers.
    void copyData(CPUData& data, bool other, int passIdx);
    bool postInit();
    ConvNet& getConvNet();
    int getNumInputReplicas();
    cudaStream_t getCopyStream(int deviceID);
    Queue<int>& getCopyFinishQueue() {
        return _copyFinishQueue;
    }
    // Block until any outstanding async copy has completed.
    void waitForCopyFinish();
    int getDataIdx() const {
        return _dataIdx;
    }
    int getStart() const {
        return _start;
    }
    int getEnd() const {
        return _end;
    }
};
|
||||
|
||||
|
||||
/*
 * Worker thread that services DataCopyMessages for a DataLayer: stages CPU
 * data in pinned host memory (_hostMemFwd) and copies it to the GPU.
 */
class DataCopyThread : public Thread {
protected:
    DataLayer* _parent;
    Queue<DataCopyMessage*> _queue;  // incoming copy/exit requests
    HostNVMatrix _hostMemFwd;        // pinned staging buffer
    Timer _requestTimer;
    int _sleepUsec;
    virtual void* run();             // thread main loop

public:
    DataCopyThread(DataLayer& parent, intv& cpus);
    Queue<DataCopyMessage*>& getQueue();
    void stop();
};
|
||||
|
||||
|
||||
/*
 * Base class for locally-connected layers (convolutional and unshared-local).
 * Geometry vectors hold one entry per input.
 */
class LocalLayer : public WeightLayer {
protected:
    intv* _padding, *_stride, *_filterSize, *_channels, *_imgSize, *_groups;
    intv* _imgPixels, *_filterPixels, *_filterChannels;
    int _modulesX, _modules, _numFilters;

public:
    LocalLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad);
    virtual ~LocalLayer();
};
|
||||
|
||||
/*
 * Convolutional layer (weights shared across spatial locations).
 */
class ConvLayer : public LocalLayer {
protected:
    int _sumWidth;
    bool _sharedBiases;   // one bias per filter (shared over locations) when true
    floatv* _weightContrastNormMin, *_weightContrastNormMax; // per-input contrast-norm bounds
    NVMatrix _weightGradTmp; // scratch for partial weight-gradient sums

    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropBiases(NVMatrix& v, PASS_TYPE passType);
    void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType);
    void truncBwdActs();
    void _constrainWeights();

public:
    ConvLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
    virtual ~ConvLayer();
};
|
||||
|
||||
/*
 * Locally-connected layer without weight sharing: each spatial location has
 * its own filter.
 */
class LocalUnsharedLayer : public LocalLayer {
protected:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropBiases(NVMatrix& v, PASS_TYPE passType);
    void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType);
    void _constrainWeights();
public:
    LocalUnsharedLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Base class for spatial pooling layers. make() is a factory that selects
 * the concrete subclass from the "pool" string in the param dict.
 */
class PoolLayer : public Layer, public TwoDLayerInterface {
protected:
    int _sizeX, _start, _stride, _outputsX; // pooling window geometry
    std::string _pool;                      // pooling type name ("max", "avg", ...)
public:
    PoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans);

    static PoolLayer& make(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Average pooling; when _sum is set it sums instead of averaging.
 */
class AvgPoolLayer : public PoolLayer {
protected:
    bool _sum;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    AvgPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Max pooling; with _abs set it pools on absolute values.
 */
class MaxPoolLayer : public PoolLayer {
protected:
    bool _abs;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    MaxPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool abs);
};
|
||||
|
||||
/*
 * Base class for pooling across channels (feature maps) rather than space.
 * make() selects the concrete subclass from the "pool" string.
 */
class CrossMapPoolLayer : public Layer, public TwoDLayerInterface {
protected:
    int _size, _start, _stride, _outputs;
    std::string _pool;
public:
    CrossMapPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans);

    static CrossMapPoolLayer& make(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Max pooling across channels (maxout-style pooling over feature maps).
 */
class CrossMapMaxPoolLayer : public CrossMapPoolLayer {
protected:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    CrossMapMaxPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Data-augmentation layer: rescales the image by a random factor (up to
 * _maxScale) and crops/pads to _tgtSize.
 */
class RandomScaleLayer : public Layer, public TwoDLayerInterface {
protected:
    int _tgtSize, _minScaledSize;
    float _maxScale; // should be >= 1
    NVMatrix _rescaledActs;
    std::vector<double> _scaleProbs; // sampling distribution over scales
public:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);

    RandomScaleLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Crops a fixed _tgtSize window starting at (_startX, _startY).
 */
class CropLayer : public Layer, public TwoDLayerInterface {
protected:
    int _tgtSize, _startX, _startY;
public:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);

    CropLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Strided subsampling ("nailbed"): keeps every _stride-th pixel starting
 * at _start, producing an _outputsX x _outputsX grid.
 */
class NailbedLayer : public Layer, public TwoDLayerInterface {
protected:
    int _start, _stride, _outputsX;
public:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);

    NailbedLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Applies a fixed Gaussian blur filter. The filter lives on the host
 * (_hFilter) and is uploaded to the GPU (_filter) in copyToGPU().
 */
class GaussianBlurLayer : public Layer, public TwoDLayerInterface {
protected:
    Matrix* _hFilter;        // host-side filter coefficients
    NVMatrix _filter;        // device-side copy of the filter
    NVMatrix _actGradsTmp;   // scratch for the backward pass
public:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
    void copyToGPU();

    GaussianBlurLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
    ~GaussianBlurLayer();
};
|
||||
|
||||
/*
 * Mirrors the image horizontally (left-right flip).
 */
class HorizontalReflectionLayer : public Layer, public TwoDLayerInterface {
protected:
public:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);

    HorizontalReflectionLayer(ConvNetThread* convNet, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Rescales the image by _scale to a _tgtSize output.
 */
class ResizeLayer : public Layer, public TwoDLayerInterface {
protected:
    float _scale;
    int _tgtSize;
public:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);

    ResizeLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Dropout: randomly zeroes units with probability (1 - _keep) when _enable
 * is set; surviving units are rescaled by 1/_keep (inverted dropout).
 */
class DropoutLayer : public Layer {
protected:
    bool _enable;        // dropout active (presumably training-time only — confirm in impl)
    float _keep;         // probability of keeping a unit
    NVMatrix _keepMask;  // per-unit keep/drop mask for the current pass
public:
    virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
    void truncBwdActs();
    DropoutLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
    /*
     * Maps a uniform random value x to the mask value: 1/keep if x < keep
     * (unit kept, pre-scaled), else 0 (unit dropped).
     */
    class DropoutSmallerThanOperator {
    private:
        float _keep, _scale;
    public:
        DropoutSmallerThanOperator(float keep) : _keep(keep), _scale(1.0f/keep) {
        }
        __device__ inline float operator()(const float x) const {
            return (x < _keep) * _scale;
        }
    };
};
|
||||
|
||||
/*
 * Variant of DropoutLayer with its own fprop/bprop implementations
 * (exact behavioral difference lives in the .cu file — confirm there).
 */
class Dropout2Layer : public DropoutLayer {
protected:
public:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
    Dropout2Layer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Color-space conversion: RGB -> YUV.
 */
class RGBToYUVLayer : public Layer {
public:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);

    RGBToYUVLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Color-space conversion: RGB -> L*a*b*; optionally centers the output
 * when _center is set.
 */
class RGBToLABLayer : public Layer {
protected:
    bool _center;
public:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);

    RGBToLABLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Local response normalization over a spatial neighborhood of _size,
 * with scale/power parameters; _denoms caches the denominators for bprop.
 */
class ResponseNormLayer : public Layer, public TwoDLayerInterface {
protected:
    int _size;
    float _scale, _pow;
    float _minDiv;       // lower bound on the divisor
    NVMatrix _denoms;    // cached normalization denominators

    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
    void truncBwdActs();
public:
    ResponseNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Response normalization across adjacent channels instead of space;
 * _blocked selects block-wise (non-overlapping) neighborhoods.
 */
class CrossMapResponseNormLayer : public ResponseNormLayer {
protected:
    bool _blocked;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    CrossMapResponseNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Local contrast normalization: like response normalization but computed on
 * mean-subtracted activations (_meanDiffs holds the differences from the
 * local mean).
 */
class ContrastNormLayer : public ResponseNormLayer {
protected:
    NVMatrix _meanDiffs;

    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
    void truncBwdActs();
public:
    ContrastNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Base class for objective-function layers. A cost layer produces scalar
 * cost values (_costv) and, during training, seeds the backward pass with
 * gradients scaled by _coeff. make() is the factory keyed on the type string.
 */
class CostLayer : public Layer {
protected:
    float _coeff;        // weight of this cost in the overall objective
    doublev _costv;      // most recently computed cost value(s)
    NVMatrix _tmpbuf;    // for error accumulation
    int _numCases;       // number of cases that the values in _costv were computed on
    bool _aggregated;
    void fpropCommon(PASS_TYPE passType);
public:
    CostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans);
    void bprop(NVMatrix& v, PASS_TYPE passType, int passIdx);
    bool fprop(PASS_TYPE passType, int passIdx);

    int getNumCases();
    virtual doublev& getCost();
    float getCoeff();
    bool isGradProducer();
    void setSendTerminalMessages(bool send);
    void resetPassIdx();

    static CostLayer& make(ConvNetThread* convNetThread, PyObject* paramsDict, std::string& type, int replicaID);
};
|
||||
|
||||
/*
|
||||
* Input 0: labels
|
||||
* Input 1: softmax outputs
|
||||
*/
|
||||
/*
 * Cross-entropy cost.
 * Input 0: labels
 * Input 1: softmax outputs
 */
class CrossEntCostLayer : public CostLayer {
protected:
    NVMatrix _trueLabelLogProbs, _correctProbs;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    CrossEntCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
|
||||
* Input 0: labels
|
||||
* Input 1: softmax outputs
|
||||
*/
|
||||
/*
 * Logistic-regression (multinomial) cost with top-k error tracking and
 * probability accumulation across input replicas.
 * Input 0: labels
 * Input 1: softmax outputs
 */
class LogregCostLayer : public CostLayer {
protected:
    NVMatrix _trueLabelLogProbs, _correctProbs, _topkProbs;
    std::map<int,NVMatrix*> _probsAccum; // input replica idx -> nvmatrix
    NVMatrix _maxProbs;
    std::map<int,int> _numAccumed; // input replica idx -> int
    int _topk;                     // k for top-k error computation
    bool _doCompute;
    virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    LogregCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
    NVMatrix& getProbsAccum(int replicaIdx);
};
|
||||
|
||||
/*
|
||||
* Input 0: labels
|
||||
* Input 1: logistic outputs
|
||||
*/
|
||||
/*
 * Binomial (per-unit logistic) cross-entropy cost, with positive examples
 * weighted by _posWeight.
 * Input 0: labels
 * Input 1: logistic outputs
 */
class BinomialCrossEntropyCostLayer : public CostLayer {
protected:
    bool _computeSoftmaxErrorRate;
    NVMatrix _tmpProbs, _tmpVec, _correctProbs;
    float _posWeight;
    virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    BinomialCrossEntropyCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
    float getPosWeight();

    // Only for use with non-logistic units
    /*
     * Elementwise gradient w.r.t. the unit output y for target t:
     *   grad = coeff * (posWeight * t/y + (t - 1)/(1 - y))
     * Divides by y and (1-y), so callers must ensure outputs stay in (0, 1).
     */
    class BinomialCrossEntGradientOperator {
    private:
        float _coeff, _posWeight;
    public:
        BinomialCrossEntGradientOperator(float coeff, float posWeight) : _coeff(coeff), _posWeight(posWeight) {
        }
        __device__ inline float operator()(const float t, const float y) const {
            return _coeff * (_posWeight * __fdividef(t, y) + __fdividef(t - 1.0f, 1.0f - y));
        }
    };
};
|
||||
|
||||
/*
|
||||
* Input 0: labels
|
||||
* Input 1: logistic outputs
|
||||
*/
|
||||
/*
 * Binomial cross-entropy cost that additionally tracks detection statistics
 * (true/declared positives etc.) on host and device.
 * Input 0: labels
 * Input 1: logistic outputs
 */
class DetectionCrossEntropyCostLayer : public BinomialCrossEntropyCostLayer {
protected:
    Matrix _hNumPositive, _hNumTruePositive, _hNumDeclaredPositive;   // host-side counters
    NVMatrix _numPositive, _numTrueNegative, _numTruePositive, _numDeclaredPositive; // device-side counters
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
public:
    DetectionCrossEntropyCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Sum-of-squares (L2) cost on its single input.
 */
class SumOfSquaresCostLayer : public CostLayer {
protected:
    NVMatrix _tmp;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    SumOfSquaresCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
#endif /* LAYER_CUH */
|
||||
|
@ -0,0 +1,88 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef LAYER_KERNELS_CUH
|
||||
#define LAYER_KERNELS_CUH
|
||||
|
||||
#include <vector>
|
||||
#include <helper_cuda.h>
|
||||
#include "../../nvmatrix/include/nvmatrix.cuh"
|
||||
|
||||
#define LOGREG_GRAD_THREADS_X 32
|
||||
#define LOGREG_GRAD_THREADS_Y 4
|
||||
|
||||
#define LOGREG_ERR_THREADS_X 128
|
||||
#define LOGREG_ERR_THREADS_Y 1
|
||||
|
||||
// Numerically-safe log for probabilities: returns the (fast, approximate)
// __logf(x) for positive x, and a large negative constant (-50) for x <= 0
// so that log(0) never produces -inf/NaN in cost computations.
__device__ inline float safelog(const float x) {
    return x > 0.0f ? __logf(x) : -50.0f;
}
|
||||
|
||||
// The input matrix here is the squared norm.
|
||||
// This replaces the squared norm with:
|
||||
// 1 if it is below the threshold given by norm2
|
||||
// norm/sqrt(a) otherwise -- i.e. the desired norm (not squared)
|
||||
// The input matrix here is the squared norm.
// This replaces the squared norm with:
// 1 if it is below the threshold given by norm2
// norm/sqrt(a) otherwise -- i.e. the scale factor that shrinks the vector
// back to the maximum allowed norm (not squared).
class MaxWeightConstraintOperator {
private:
    float _norm, _norm2; // max norm and its square
public:
    MaxWeightConstraintOperator(float norm) : _norm(norm), _norm2(norm*norm) {
    }
    __device__ inline float operator()(const float a) const {
        return a > _norm2 ? __fdividef(_norm, sqrtf(a)) : 1.0f;
    }
};
|
||||
|
||||
// Like MaxWeightConstraintOperator but unconditional: always rescales to
// exactly the target norm, i.e. maps the squared norm a to norm/sqrt(a).
// Note: _norm2 is computed but unused by operator().
class HardWeightConstraintOperator {
private:
    float _norm, _norm2;
public:
    HardWeightConstraintOperator(float norm) : _norm(norm), _norm2(norm*norm) {
    }
    __device__ inline float operator()(const float a) const {
        return __fdividef(_norm, sqrtf(a));
    }
};
|
||||
|
||||
// Maps a squared norm a to a rescaling factor that clamps the (scaled) norm
// into [_min, _max]: returns min/norm if below, max/norm if above, 1 otherwise.
class WeightContrastNormOperator {
private:
    float _min, _max, _scale;
public:
    WeightContrastNormOperator(float min, float max, float scale) : _min(min), _max(max), _scale(scale) {
    }
    __device__ inline float operator()(float a) const {
        a = sqrtf(a) * _scale; // a is now the scaled (non-squared) norm
        return a < _min ? __fdividef(_min, a) : a > _max ? __fdividef(_max, a) : 1.0f;
    }
};
|
||||
|
||||
void computeCrossEntCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out);
|
||||
void computeCrossEntGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
|
||||
void computeSoftmaxGrad(NVMatrix& acts, NVMatrix& actsGrad, NVMatrix& target, float scaleTarget, float scaleGrad);
|
||||
|
||||
void computeLogregCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out);
|
||||
void computeLogregGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
|
||||
|
||||
|
||||
// Numerical stability optimization: this routine combines computeLogregGrad with computeSoftmaxGrad
|
||||
// to avoid dividing and then multiplying by quantities that may be near zero.
|
||||
void computeCrossEntSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
|
||||
void computeLogregSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
|
||||
void computeEltwiseMaxGrad(NVMatrix& actGrad, NVMatrix& input, NVMatrix& output, NVMatrix& target, bool add);
|
||||
void computeMultiSoftmaxCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs, NVMatrix& labelLogProbs_out,
|
||||
NVMatrix& correctProbs_out, NVMatrix& top5Probs_out, int setSize);
|
||||
#endif /* LAYER_KERNELS_CUH */
|
||||
|
74
caffe2/contrib/cuda-convnet2/cudaconvnet/include/lr.cuh
Normal file
74
caffe2/contrib/cuda-convnet2/cudaconvnet/include/lr.cuh
Normal file
@ -0,0 +1,74 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef LR_CUH
|
||||
#define LR_CUH
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <helper_cuda.h>
|
||||
#include <assert.h>
|
||||
#include <Python.h>
|
||||
#include "util.cuh"
|
||||
#include "../../nvmatrix/include/nvmatrix.cuh"
|
||||
#include "../../util/include/matrix.h"
|
||||
|
||||
/*
|
||||
* The maximum learning rate is _baseRate.
|
||||
* The minimum learning rate is _baseRate / _tgtFactor.
|
||||
*
|
||||
* These classes define annealing schedules that interpolate between these
|
||||
* two extrema.
|
||||
*/
|
||||
/*
 * Base class for parameter (e.g. learning-rate) annealing schedules.
 * getValue(progress) maps training progress (presumably in [0, 1] —
 * confirm with callers) to the current parameter value; the base class
 * returns a constant — confirm in the implementation file.
 * make() builds the concrete schedule from a Python dict.
 */
class ParameterSchedule {
protected:
    double _baseRate;   // maximum (initial) value of the parameter
public:
    ParameterSchedule(double base);
    virtual double getValue(double progress);
    double getBaseValue() const;
    virtual ~ParameterSchedule();

    static ParameterSchedule& make(PyObject* schedDict);
};
|
||||
|
||||
/*
 * Linear interpolation from _baseRate down to _finalRate
 * (= baseRate / tgtFactor) over the course of training.
 */
class LinearParameterSchedule : public ParameterSchedule {
protected:
    double _finalRate;
public:
    LinearParameterSchedule(double base, double tgtFactor);
    virtual double getValue(double progress);
};
|
||||
|
||||
/*
 * Exponential decay from _baseRate to baseRate / tgtFactor;
 * _powBase is the precomputed decay base used by getValue.
 */
class ExpParameterSchedule : public ParameterSchedule {
protected:
    double _powBase;
public:
    ExpParameterSchedule(double baseRate, double tgtFactor);
    virtual double getValue(double progress);
};
|
||||
|
||||
/*
 * Step-wise exponential decay: _rates holds numSteps precomputed values
 * spanning [baseRate / tgtFactor, baseRate]; getValue picks the one for
 * the current progress.
 */
class DiscreteExpParameterSchedule : public ParameterSchedule {
protected:
    std::vector<double> _rates;
public:
    DiscreteExpParameterSchedule(double baseRate, double tgtFactor, int numSteps);
    virtual double getValue(double progress);
};
|
||||
|
||||
|
||||
#endif /* LR_CUH */
|
@ -0,0 +1,61 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include "../../nvmatrix/include/nvmatrix.cuh"
|
||||
|
||||
class MemorySource;
|
||||
|
||||
// A named, non-owning handle onto (a slice of) a MemorySource's buffer.
// All methods are declared here and defined out-of-line.
class MemoryView {
protected:
    MemorySource* _src;  // backing source; NOTE(review): ownership not visible here -- confirm
    std::string _name;   // key identifying this view within the source
public:
    MemoryView(MemorySource& src, std::string& name);
    ~MemoryView();
    // Memory for this view; the numCases overload presumably sizes it for a
    // minibatch of that many cases -- confirm against MemorySource::getMemory.
    NVMatrix& getMemory(int numCases);
    NVMatrix& getMemory();
    MemorySource& getMemorySource();
    bool isParent();
    std::string& getName();
    // Creates a new named view onto the same underlying source.
    MemoryView& clone(std::string& name);
};
|
||||
|
||||
// Remember: PassThroughLayer, and therefore MemorySource, exists on a particular GPU.
|
||||
// Owns a single device-resident buffer and hands out named views (slices)
// of it.  Per the note above, a MemorySource lives on one particular GPU.
// All methods are declared here and defined out-of-line.
class MemorySource {
protected:
//    int _inputIdx;
    NVMatrix _memory;  // the underlying buffer, resident on _deviceID
    int _deviceID;
    int _size;
    std::map<std::string, std::pair<int,int> > _viewRanges;  // view name --> range within _memory
    std::map<std::string, NVMatrix*> _memoryViews; // input idx --> slice of _memory
    std::set<std::string> _truncateRequests;       // views with a pending truncate()
    Lock _lock;  // presumably guards the maps above against concurrent users -- confirm in .cu
public:
    MemorySource(int size, int deviceID);
    ~MemorySource();
    NVMatrix& getMemory(std::string& name, int numCases);
    NVMatrix& getMemory(std::string& name);
    // Registers a named consumer of this buffer, optionally restricted to a range.
    MemoryView& addUser(std::string& name, std::pair<int,int> range);
    MemoryView& addUser(std::string& name);
    std::pair<int,int> getRange(std::string& name);
    int getSize();
    bool truncate(std::string& name);
    // Factory: creates a source and a first view for parentUser -- defined out-of-line.
    static MemoryView& make(int size, int deviceID, std::string& parentUser);
};
|
||||
|
128
caffe2/contrib/cuda-convnet2/cudaconvnet/include/messages.cuh
Normal file
128
caffe2/contrib/cuda-convnet2/cudaconvnet/include/messages.cuh
Normal file
@ -0,0 +1,128 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MESSAGES_CUH_
|
||||
#define MESSAGES_CUH_
|
||||
|
||||
#include <string>
|
||||
#include "layer.cuh"
|
||||
|
||||
class Layer;
|
||||
|
||||
// Type tags for the inter-thread messages defined in this header.
enum MESSAGES { FPROP_TERMINAL,
                BPROP_TERMINAL,
                BPROP_READY,
                FPROP_READY,
                SYNC,
                COPY_TO_CPU,
                COPY_TO_GPU,
                UPDATE_WEIGHTS,
                CONSTRAIN_WEIGHTS,
                RESET,
                RESET_PASS_IDX,
                COST_COMPUTED,
                BPROP_START,
                EXIT_CONVNET};
|
||||
|
||||
// Base class for messages passed between ConvNet worker threads.
// Carries only a type tag; subclasses add their payloads.
class Message {
protected:
    MESSAGES _messageType;
public:
    MESSAGES getType() {
        return _messageType;
    }
    // Polymorphic copy; subclasses override with covariant return types.
    virtual Message* clone() {
        return new Message(_messageType);
    }
    Message(MESSAGES messageType) : _messageType(messageType) {
    }
    virtual ~Message() {
    }
};
|
||||
|
||||
class PropMessage : public Message {
|
||||
protected:
|
||||
Layer *_toLayer;
|
||||
PASS_TYPE _passType;
|
||||
int _passIdx;
|
||||
public:
|
||||
|
||||
Layer& getToLayer() {
|
||||
return *_toLayer;
|
||||
}
|
||||
|
||||
PASS_TYPE getPassType() {
|
||||
return _passType;
|
||||
}
|
||||
|
||||
int getPassIdx() {
|
||||
return _passIdx;
|
||||
}
|
||||
|
||||
virtual PropMessage* clone() {
|
||||
return new PropMessage(*_toLayer, _passType, _passIdx, _messageType);
|
||||
}
|
||||
|
||||
PropMessage(Layer& toLayer, PASS_TYPE passType, int passIdx, MESSAGES msgType)
|
||||
: _toLayer(&toLayer), _passType(passType), _passIdx(passIdx), Message(msgType) {
|
||||
}
|
||||
};
|
||||
|
||||
// PropMessage specialization tagged FPROP_READY (forward propagation).
class FpropMessage : public PropMessage {
public:
    FpropMessage(Layer& toLayer, PASS_TYPE passType, int passIdx)
        : PropMessage(toLayer, passType, passIdx, FPROP_READY) {
    }
    // Covariant clone preserving the most-derived type.
    virtual FpropMessage* clone() {
        return new FpropMessage(*_toLayer, _passType, _passIdx);
    }
};
|
||||
|
||||
// PropMessage specialization tagged BPROP_READY (backward propagation).
class BpropMessage : public PropMessage {
public:
    BpropMessage(Layer& toLayer, PASS_TYPE passType, int passIdx)
        : PropMessage(toLayer, passType, passIdx, BPROP_READY) {
    }
    // Covariant clone preserving the most-derived type.
    virtual BpropMessage* clone() {
        return new BpropMessage(*_toLayer, _passType, _passIdx);
    }
};
|
||||
|
||||
class BpropStartMessage : public Message {
|
||||
protected:
|
||||
PASS_TYPE _passType;
|
||||
int _passIdx;
|
||||
public:
|
||||
PASS_TYPE getPassType() {
|
||||
return _passType;
|
||||
}
|
||||
|
||||
int getPassIdx() {
|
||||
return _passIdx;
|
||||
}
|
||||
|
||||
virtual BpropStartMessage* clone() {
|
||||
return new BpropStartMessage(_passType, _passIdx);
|
||||
}
|
||||
|
||||
BpropStartMessage(PASS_TYPE passType, int passIdx)
|
||||
: _passType(passType), Message(BPROP_START), _passIdx(passIdx) {
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
#endif /* MESSAGES_CUH_ */
|
541
caffe2/contrib/cuda-convnet2/cudaconvnet/include/neuron.cuh
Normal file
541
caffe2/contrib/cuda-convnet2/cudaconvnet/include/neuron.cuh
Normal file
@ -0,0 +1,541 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef NEURONS_CUH
|
||||
#define NEURONS_CUH
|
||||
|
||||
#include <Python.h>
|
||||
#include <assert.h>
|
||||
#include <string>
|
||||
#include "../../nvmatrix/include/nvmatrix.cuh"
|
||||
#include <helper_cuda.h>
|
||||
|
||||
// Adapts a two-argument gradient functor for accumulation: given
// (actGrad, act, target) it returns op(actGrad, act) + target, so the
// computed gradient is ADDED into an existing gradient buffer
// (used with NVMatrix::applyTernary in the neurons below).
template <class GradientOp>
class AddGradientBinaryOperator {
    GradientOp _op;
public:
    AddGradientBinaryOperator(GradientOp op) : _op(op) {
    }
    __device__ inline float operator()(const float unitActGrad, const float unitAct, const float target) const {
        return _op(unitActGrad, unitAct) + target;
    }
};
|
||||
|
||||
// Adapts a one-argument gradient functor for accumulation: given
// (actGrad, target) it returns target + op(actGrad), so the computed
// gradient is ADDED into an existing gradient buffer
// (used with NVMatrix::applyBinary in the neurons below).
template <class GradientOp>
class AddGradientOperator {
    GradientOp _op;
public:
    AddGradientOperator(GradientOp op) : _op(op) {
    }
    __device__ inline float operator()(const float unitActGrad, const float target) const {
        return target + _op(unitActGrad);
    }
};
|
||||
|
||||
/* =======================
|
||||
* Neuron
|
||||
* -----------------------
|
||||
*
|
||||
* f(x) = x
|
||||
* =======================
|
||||
*/
|
||||
// Base activation function: the identity, plus the driver API shared by all
// neurons.  Subclasses customize behavior by overriding the three protected
// hooks (_activate, _computeInputGrad, _addInputGrad).
class Neuron {
protected:
    bool _activated;  // set once activate() has run; gradient computation asserts it
    // Inputs and outputs potentially point to the same matrix, depending on the neuron
    NVMatrix* _inputs, *_outputs;
    // Forward pass hook: identity (copies only when operating out-of-place).
    virtual void _activate() {
        if (_inputs != _outputs) {
            _inputs->copy(*_outputs);
        }
    }
    // Overwrite 'target' with the input-gradient (identity: pass through).
    virtual void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        if (&target != &actsGrad) {
            actsGrad.copy(target);
        }
    }
    // Accumulate the input-gradient into 'target' (identity: add through).
    virtual void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        if (&target != &actsGrad) {
            target.add(actsGrad);
        }
    }
public:
    Neuron() : _activated(false), _inputs(NULL), _outputs(NULL) {
    }
    // Runs the forward pass and remembers both buffers so the backward-pass
    // hooks can read whichever of input/output they need.
    virtual void activate(NVMatrix& inputs, NVMatrix& outputs) {
        _activated = true;
        _inputs = &inputs;
        _outputs = &outputs;
        _activate();
    }

    // add == false: 'target' is resized and overwritten with the gradient;
    // add == true:  the gradient is accumulated into 'target'.
    // Must be called after activate() (asserted).
    virtual void computeInputGrad(NVMatrix& actsGrad, NVMatrix& target, bool add) {
        assert(_activated);
        if (!add) {
            target.resize(actsGrad);
            _computeInputGrad(actsGrad, target);
        } else {
            _addInputGrad(actsGrad, target);
        }
    }

    // Factory: builds the neuron described by a Python dict (defined out-of-line).
    static Neuron& makeNeuron(PyObject* neuronDict);
};
|
||||
|
||||
/* =======================
|
||||
* LogisticNeuron
|
||||
* -----------------------
|
||||
*
|
||||
* f(x) = 1 / (1 + e^-x)
|
||||
* =======================
|
||||
*/
|
||||
// Sigmoid activation.  The gradient is computed from the *output* y using
// f'(x) = y * (1 - y), so the input buffer is not needed after activation.
class LogisticNeuron : public Neuron {
protected:
    void _activate() {
        _inputs->apply(NVMatrixOps::Logistic(), *_outputs);
    }

    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(LogisticGradientOperator(), *_outputs, target);
    }

    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyTernary(AddGradientBinaryOperator<LogisticGradientOperator>(LogisticGradientOperator()), *_outputs, target, target);
    }
public:
    // grad_in = grad_out * y * (1 - y), where y is the unit's output.
    class LogisticGradientOperator {
    public:
        __device__ inline float operator()(float unitActGrad, float unitAct) const {
            return unitActGrad * unitAct * (1.0f - unitAct);
        }
    };

    LogisticNeuron() : Neuron() {
    }
};
|
||||
|
||||
/* =======================
|
||||
* LogNeuron
|
||||
* -----------------------
|
||||
*
|
||||
* f(x) = log(eps + x)
|
||||
* =======================
|
||||
*/
|
||||
class LogNeuron : public Neuron {
|
||||
protected:
|
||||
float _eps;
|
||||
void _activate() {
|
||||
_inputs->apply(LogOperator(_eps), *_outputs);
|
||||
}
|
||||
|
||||
void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
|
||||
actsGrad.applyBinary(LogGradientOperator(_eps), *_inputs, target);
|
||||
}
|
||||
|
||||
void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
|
||||
actsGrad.applyTernary(AddGradientBinaryOperator<LogGradientOperator>(LogGradientOperator(_eps)), *_inputs, target, target);
|
||||
}
|
||||
public:
|
||||
class LogGradientOperator {
|
||||
protected:
|
||||
float _eps;
|
||||
public:
|
||||
__device__ inline float operator()(float unitActGrad, float unitInput) const {
|
||||
return __fdividef(unitActGrad, _eps + unitInput);
|
||||
}
|
||||
LogGradientOperator(float eps) : _eps(eps) {
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
class LogOperator {
|
||||
protected:
|
||||
float _eps;
|
||||
public:
|
||||
__device__ inline float operator()(float x) const {
|
||||
return __logf(_eps + x);
|
||||
}
|
||||
LogOperator(float eps) : _eps(eps) {
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
LogNeuron(float eps) : _eps(eps), Neuron() {
|
||||
}
|
||||
};
|
||||
|
||||
/* =======================
|
||||
* ReluNeuron
|
||||
* -----------------------
|
||||
*
|
||||
* f(x) = max(0, x)
|
||||
* =======================
|
||||
*/
|
||||
// Rectified linear unit.  The gradient is computed from the *output*
// (x > 0 exactly when y > 0), so in-place activation is safe.
class ReluNeuron : public Neuron {
protected:
    virtual void _activate() {
        _inputs->apply(ReluOperator(), *_outputs);
    }

    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(ReluGradientOperator(), *_outputs, target);
    }

    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyTernary(AddGradientBinaryOperator<ReluGradientOperator>(ReluGradientOperator()), *_outputs, target, target);
    }
public:
    class ReluOperator {
    public:
        __device__ inline float operator()(float x) const {
            return x < 0.0f ? 0.0f : x;
        }
    };

    // grad_in = grad_out * 1{y > 0} (the bool comparison converts to 0/1).
    class ReluGradientOperator {
    public:
        __device__ inline float operator()(float unitActGrad, float unitAct) const {
            return unitActGrad * (unitAct > 0.0f);
        }
    };

    ReluNeuron() : Neuron() {
    }
};
|
||||
|
||||
|
||||
/* =======================
|
||||
* BoundedReluNeuron
|
||||
* -----------------------
|
||||
*
|
||||
* f(x) = min(a, max(0, x))
|
||||
* =======================
|
||||
*/
|
||||
// ReLU clipped above at 'a': f(x) = min(a, max(0, x)).  The gradient is
// computed from the output: nonzero only strictly inside (0, a).
class BoundedReluNeuron : public Neuron {
protected:
    float _a;  // upper clipping bound

    void _activate() {
        _inputs->apply(BoundedReluOperator(_a), *_outputs);
    }

    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(BoundedReluGradientOperator(_a), *_outputs, target);
    }

    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyTernary(AddGradientBinaryOperator<BoundedReluGradientOperator>(BoundedReluGradientOperator(_a)), *_outputs, target, target);
    }
public:
    class BoundedReluOperator {
    private:
        float _a;
    public:
        BoundedReluOperator(float a) : _a(a) {
        }
        __device__ inline float operator()(float x) const {
            return x < 0.0f ? 0.0f : x > _a ? _a : x;
        }
    };

    // grad_in = grad_out * 1{0 < y < a}.
    class BoundedReluGradientOperator {
    private:
        float _a;
    public:
        BoundedReluGradientOperator(float a) : _a(a) {
        }
        __device__ inline float operator()(float unitActGrad, float unitAct) const {
            return unitActGrad * (unitAct > 0.0f) * (unitAct < _a);
        }
    };

    BoundedReluNeuron(float a) : Neuron(), _a(a) {
    }
};
|
||||
|
||||
/* =======================
|
||||
* AbsNeuron
|
||||
* -----------------------
|
||||
*
|
||||
* f(x) = abs(x)
|
||||
* =======================
|
||||
*/
|
||||
// Absolute value activation.  The gradient needs the *sign of the input*,
// which is lost in the output, so in-place activation is forbidden (asserted).
class AbsNeuron : public Neuron {
protected:
    void _activate() {
        assert(_inputs != _outputs);  // gradient later reads *_inputs
        _inputs->apply(NVMatrixOps::Abs(), *_outputs);
    }

    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(AbsGradientOperator(), *_inputs, target);
    }

    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyTernary(AddGradientBinaryOperator<AbsGradientOperator>(AbsGradientOperator()), *_inputs, target, target);
    }
public:
    // grad_in = grad_out * sign(x); note x == 0 maps to -1 here.
    class AbsGradientOperator {
    public:
        __device__ inline float operator()(float unitActGrad, float unitInput) const {
            return unitActGrad * (unitInput > 0.0f ? 1.0f : -1.0f);
        }
    };

    AbsNeuron() : Neuron() {
    }
};
|
||||
|
||||
/* =======================
|
||||
* TanhNeuron
|
||||
* -----------------------
|
||||
*
|
||||
* f(x) = a*tanh(b*x)
|
||||
* =======================
|
||||
*/
|
||||
// Scaled tanh: f(x) = a * tanh(b * x), computed via the identity
// tanh(t) = 2 / (1 + e^(-2t)) - 1.  The gradient is computed from the
// output: f'(x) = b * (a - y^2 / a).
class TanhNeuron : public Neuron {
protected:
    float _a, _b;

    void _activate() {
        _inputs->apply(TanhOperator(_a, _b), *_outputs);
    }

    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(TanhGradientOperator(_a, _b), *_outputs, target);
    }

    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyTernary(AddGradientBinaryOperator<TanhGradientOperator>(TanhGradientOperator(_a, _b)), *_outputs, target, target);
    }
public:
    class TanhOperator {
    private:
        float _a, _n2b;  // _n2b = -2b, precomputed for the exp form above
    public:
        TanhOperator(float a, float b) : _a(a), _n2b(-2*b) {
        }
        // NOTE(review): 'virtual' on a __device__ functor's operator() is
        // unusual (adds a vtable to a by-value functor); likely unnecessary
        // -- confirm before changing, since this is upstream cuda-convnet2 code.
        virtual __device__ inline float operator()(float x) const {
            return _a * (__fdividef(2.0f, 1.0f + __expf(x * _n2b)) - 1.0f);
        }
    };

    class TanhGradientOperator {
    private:
        float _b, _a;
    public:
        TanhGradientOperator(float a, float b) : _b(b), _a(a) {
        }
        // grad_in = grad_out * b * (a - y^2 / a), with y the unit's output.
        __device__ inline float operator()(float unitActGrad, float unitAct) const {
            // const float t = (1.0f - __fdividef(unitAct, _a)) / 2.0f;
            // return unitActGrad * _n4ab * (t * (t - 1.0f));
            return unitActGrad * _b * (_a - __fdividef(unitAct * unitAct, _a));
        }
    };

    TanhNeuron(float a, float b) : Neuron(), _a(a), _b(b) {
    }
};
|
||||
|
||||
/* =======================
|
||||
* DoubleReluNeuron
|
||||
* -----------------------
|
||||
*
|
||||
* f(x) = x - a*tanh(x/a)
|
||||
* =======================
|
||||
*/
|
||||
// f(x) = x - a * tanh(x / a).  The gradient reads the *input*, so in-place
// activation is forbidden (asserted); f'(x) = tanh(x/a)^2.
class DoubleReluNeuron : public Neuron {
protected:
    float _a;

    void _activate() {
        assert(_inputs != _outputs);  // gradient later reads *_inputs
        _inputs->apply(DoubleReluOperator(_a), *_outputs);
    }

    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(DoubleReluGradientOperator(_a), *_inputs, target);
    }

    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyTernary(AddGradientBinaryOperator<DoubleReluGradientOperator>(DoubleReluGradientOperator(_a)), *_inputs, target, target);
    }
public:
    class DoubleReluOperator {
    private:
        float _a, _n2a;  // _n2a = -2/a, precomputed for the exp form of tanh
    public:
        DoubleReluOperator(float a) : _a(a), _n2a(-2.0f / a) {
        }
        // NOTE(review): 'virtual' on a __device__ functor's operator() is
        // unusual; likely unnecessary -- confirm before changing.
        virtual __device__ inline float operator()(float x) const {
            return x - _a * (__fdividef(2.0f, 1.0f + __expf(_n2a * x)) - 1.0f);
        }
    };

    class DoubleReluGradientOperator {
    private:
        float _n2a;
    public:
        DoubleReluGradientOperator(float a) : _n2a(-2.0f / a) {
        }
        // grad_in = grad_out * tanh(x/a)^2, recomputing tanh from the input.
        __device__ inline float operator()(float unitActGrad, float unitInput) const {
            const float tanh = __fdividef(2.0f, 1.0f + __expf(_n2a * unitInput)) - 1.0f;
            return unitActGrad * (tanh*tanh);
        }
    };

    DoubleReluNeuron(float a) : Neuron(), _a(a) {
    }
};
|
||||
|
||||
/* =======================
|
||||
* SoftReluNeuron
|
||||
* -----------------------
|
||||
*
|
||||
* f(x) = log(1 + e^x)
|
||||
* =======================
|
||||
*/
|
||||
// Softplus: f(x) = log(1 + e^x).  Forward and backward both use a piecewise
// form that switches to the asymptote for values above 4 (better numerical
// stability than the closed form).  Gradient is computed from the output.
class SoftReluNeuron : public Neuron {
protected:
    void _activate() {
//        assert(_inputs != _outputs);
        _inputs->apply(SoftReluOperator(), *_outputs);
    }

    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(SoftReluGradientOperator(), *_outputs, target);
    }

    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyTernary(AddGradientBinaryOperator<SoftReluGradientOperator>(SoftReluGradientOperator()), *_outputs, target, target);
    }
public:
    class SoftReluOperator {
    public:
        __device__ inline float operator()(float x) const {
            // This piece-wise implementation has better numerical stability than
            // simply computing log(1 + e^x).
            return x > 4.0f ? x : __logf(1.0f + __expf(x));
        }
    };

    // grad_in = grad_out * (1 - e^(-y)); for y > 4 the factor is taken as 1,
    // mirroring the piecewise forward pass.
    class SoftReluGradientOperator {
    public:
        __device__ inline float operator()(float unitActGrad, float unitOutput) const {
            if (unitOutput > 4.0f) {
                return unitActGrad;
            }
            const float f = __expf(-unitOutput);
            return unitActGrad * (1.0f - f);
        }
    };

    SoftReluNeuron() : Neuron() {
    }
};
|
||||
|
||||
/* =======================
|
||||
* SquareNeuron
|
||||
* -----------------------
|
||||
*
|
||||
* f(x) = x^2
|
||||
* =======================
|
||||
*/
|
||||
// f(x) = x^2.  The gradient needs the *input*, so in-place activation is
// forbidden (asserted).
class SquareNeuron : public Neuron {
protected:
    void _activate() {
        assert(_inputs != _outputs);  // gradient later reads *_inputs
        _inputs->apply(NVMatrixOps::Square(), *_outputs);
    }

    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(SquareGradientOperator(), *_inputs, target);
    }

    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyTernary(AddGradientBinaryOperator<SquareGradientOperator>(SquareGradientOperator()), *_inputs, target, target);
    }
public:
    // grad_in = grad_out * 2x.
    class SquareGradientOperator {
    public:
        __device__ inline float operator()(float unitActGrad, float unitInput) const {
            return unitActGrad * 2.0f * unitInput;
        }
    };

    SquareNeuron() : Neuron() {
    }
};
|
||||
|
||||
/* =======================
|
||||
* SqrtNeuron
|
||||
* -----------------------
|
||||
*
|
||||
* f(x) = sqrt(x)
|
||||
* =======================
|
||||
*/
|
||||
// f(x) = sqrt(x).  The gradient is computed from the output: f'(x) = 1/(2y),
// so in-place activation is safe.  NOTE(review): divides by y -- y == 0
// produces inf; behavior for x < 0 depends on NVMatrixOps::Sqrt.
class SqrtNeuron : public Neuron {
protected:
    void _activate() {
        _inputs->apply(NVMatrixOps::Sqrt(), *_outputs);
    }

    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(SqrtGradientOperator(), *_outputs, target);
    }

    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyTernary(AddGradientBinaryOperator<SqrtGradientOperator>(SqrtGradientOperator()), *_outputs, target, target);
    }
public:
    // grad_in = grad_out / (2y).
    class SqrtGradientOperator {
    public:
        __device__ inline float operator()(float unitActGrad, float unitAct) const {
            return __fdividef(unitActGrad, 2.0f * unitAct);
        }
    };

    SqrtNeuron() : Neuron() {
    }
};
|
||||
|
||||
/* =======================
|
||||
* LinearNeuron
|
||||
* -----------------------
|
||||
*
|
||||
* f(x) = a*x + b
|
||||
* =======================
|
||||
*/
|
||||
// Affine activation f(x) = a*x + b; the gradient is simply scaling by a.
class LinearNeuron : public Neuron {
protected:
    float _a, _b;
    void _activate() {
        _inputs->apply(NVMatrixOps::Linear(_a, _b), *_outputs);
    }

    // grad_in = a * grad_out.
    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.scale(_a, target);
    }

    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(AddGradientOperator<NVMatrixOps::MultByScalar>(NVMatrixOps::MultByScalar(_a)), target, target);
    }
public:
    LinearNeuron(float a, float b) : Neuron(), _a(a), _b(b) {
    }
};
|
||||
#endif /* NEURONS_CUH */
|
||||
|
@ -0,0 +1,175 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef PIPEDISPENSER_CUH_
|
||||
#define PIPEDISPENSER_CUH_
|
||||
|
||||
#include <pthread.h>

#include <algorithm>
#include <cstdlib>
#include <iterator>
#include <map>
#include <set>

#include "../../util/include/thread.h"
#include "util.cuh"
|
||||
|
||||
/*
|
||||
* PipeDispenser interface
|
||||
*/
|
||||
// Hands out "pipes" (small integer resource ids) to competing threads.
// Subclasses define the allocation policy via getPipe()/freePipe().
class PipeDispenser {
protected:
    int _numPipes;            // NOTE(review): never assigned by the visible constructors
    seti _pipes;              // ids currently available for dispensing
    pthread_mutex_t *_mutex;  // heap-allocated; created by init(), destroyed in dtor

    void lock() {
        pthread_mutex_lock(_mutex);
    }

    void unlock() {
        pthread_mutex_unlock(_mutex);
    }

    // Allocates and initializes the mutex.  NOTE: called from the
    // constructors, so virtual dispatch does NOT reach subclass overrides
    // here -- only this base version runs during base-class construction.
    virtual void init() {
        _mutex = (pthread_mutex_t*)(malloc(sizeof (pthread_mutex_t)));
        pthread_mutex_init(_mutex, NULL);
    }
public:
    PipeDispenser(const seti& pipes) {
        _pipes.insert(pipes.begin(), pipes.end());
        init();
    }

    // Convenience: makes the pipe set {0, 1, ..., numPipes-1}.
    PipeDispenser(int numPipes) {
        for (int i = 0; i < numPipes; ++i) {
            _pipes.insert(i);
        }
        init();
    }

    virtual ~PipeDispenser() {
        pthread_mutex_destroy(_mutex);
        free(_mutex);
    }

    // Claim a pipe from 'interested'; blocking/fallback behavior is
    // subclass-defined.
    virtual int getPipe(const seti& interested) = 0;

    // Convenience overload for a single interesting pipe id.
    int getPipe(int interested) {
        seti tmp;
        tmp.insert(interested);
        return getPipe(tmp);
    }

    virtual void freePipe(int pipe) = 0;
};
|
||||
|
||||
/*
|
||||
* This one blocks until there is a free pipe to return.
|
||||
*/
|
||||
class PipeDispenserBlocking : public PipeDispenser {
|
||||
protected:
|
||||
pthread_cond_t *_cv;
|
||||
|
||||
void wait() {
|
||||
pthread_cond_wait(_cv, _mutex);
|
||||
}
|
||||
|
||||
void broadcast() {
|
||||
pthread_cond_broadcast(_cv);
|
||||
}
|
||||
|
||||
int getAvailablePipes(const seti& interested, intv& available) {
|
||||
available.clear();
|
||||
std::set_intersection(_pipes.begin(), _pipes.end(), interested.begin(), interested.end(), std::back_inserter(available));
|
||||
return available.size();
|
||||
}
|
||||
|
||||
virtual void init() {
|
||||
PipeDispenser::init();
|
||||
_cv = (pthread_cond_t*)(malloc(sizeof (pthread_cond_t)));
|
||||
pthread_cond_init(_cv, NULL);
|
||||
}
|
||||
public:
|
||||
PipeDispenserBlocking(const seti& pipes) : PipeDispenser(pipes) {
|
||||
init();
|
||||
}
|
||||
|
||||
PipeDispenserBlocking(int numPipes) : PipeDispenser(numPipes) {
|
||||
init();
|
||||
}
|
||||
|
||||
~PipeDispenserBlocking() {
|
||||
pthread_cond_destroy(_cv);
|
||||
free(_cv);
|
||||
}
|
||||
|
||||
int getPipe(const seti& interested) {
|
||||
lock();
|
||||
intv avail;
|
||||
while (getAvailablePipes(interested, avail) == 0) {
|
||||
wait();
|
||||
}
|
||||
int pipe = avail[0];
|
||||
_pipes.erase(pipe);
|
||||
unlock();
|
||||
return pipe;
|
||||
}
|
||||
|
||||
void freePipe(int pipe) {
|
||||
lock();
|
||||
_pipes.insert(pipe);
|
||||
broadcast();
|
||||
unlock();
|
||||
}
|
||||
};
|
||||
|
||||
/*
|
||||
* This one returns the least-occupied pipe.
|
||||
*/
|
||||
class PipeDispenserNonBlocking : public PipeDispenser {
|
||||
protected:
|
||||
std::map<int,int> _pipeUsers;
|
||||
|
||||
public:
|
||||
PipeDispenserNonBlocking(const seti& pipes) : PipeDispenser(pipes) {
|
||||
for (seti::iterator it = pipes.begin(); it != pipes.end(); ++it) {
|
||||
_pipeUsers[*it] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
int getPipe(const seti& interested) {
|
||||
lock();
|
||||
int pipe = -1, users = 1 << 30;
|
||||
for (seti::iterator it = _pipes.begin(); it != _pipes.end(); ++it) {
|
||||
if (interested.count(*it) > 0 && _pipeUsers[*it] < users) {
|
||||
pipe = *it;
|
||||
users = _pipeUsers[*it];
|
||||
}
|
||||
}
|
||||
if (pipe >= 0) {
|
||||
_pipeUsers[pipe]++;
|
||||
}
|
||||
unlock();
|
||||
return pipe;
|
||||
}
|
||||
|
||||
void freePipe(int pipe) {
|
||||
lock();
|
||||
_pipeUsers[pipe]--;
|
||||
unlock();
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
#endif /* PIPEDISPENSER_CUH_ */
|
@ -0,0 +1,35 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef PYCONVNET3_CUH
|
||||
#define PYCONVNET3_CUH
|
||||
|
||||
#define _QUOTEME(x) #x
|
||||
#define QUOTEME(x) _QUOTEME(x)
|
||||
|
||||
extern "C" void init_ConvNet();
|
||||
|
||||
PyObject* initModel(PyObject *self, PyObject *args);
|
||||
PyObject* startBatch(PyObject *self, PyObject *args);
|
||||
PyObject* finishBatch(PyObject *self, PyObject *args);
|
||||
PyObject* checkGradients(PyObject *self, PyObject *args);
|
||||
PyObject* syncWithHost(PyObject *self, PyObject *args);
|
||||
PyObject* startMultiviewTest(PyObject *self, PyObject *args);
|
||||
PyObject* startFeatureWriter(PyObject *self, PyObject *args);
|
||||
PyObject* startDataGrad(PyObject *self, PyObject *args);
|
||||
PyObject* decodeJpeg(PyObject *self, PyObject *args);
|
||||
|
||||
#endif
|
@ -0,0 +1,185 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef REDUCEPIPELINE_CUH_H_
|
||||
#define REDUCEPIPELINE_CUH_H_
|
||||
|
||||
#include "../../util/include/thread.h"
|
||||
#include "../../util/include/queue.h"
|
||||
#include <helper_cuda.h>
|
||||
#include "../../nvmatrix/include/nvmatrix.cuh"
|
||||
#include "util.cuh"
|
||||
|
||||
#define REDUCE_MIN_CHUNK_SIZE (1<<18) // 256k
|
||||
#define REDUCE_MAX_CHUNKS 16
|
||||
#define REDUCE_MIN_CHUNKS 2
|
||||
|
||||
// Type tags for messages consumed by the reduction-pipeline segments below.
enum REDUCE_MESSAGE_TYPE {
    REDUCE_CHUNK,
    REDUCE_START,
    EXIT
};
|
||||
|
||||
class ReducePeer;
|
||||
class ReducerSource;
|
||||
class IReduceSegment;
|
||||
class IEightGPUReducer;
|
||||
|
||||
// Base message for the reducer threads: a type tag, two scaling factors,
// and a per-device map of the matrices taking part in the reduction.
class ReduceMessage {
protected:
    REDUCE_MESSAGE_TYPE _msgType;
    // Scale factors applied to intermediate results / to the target --
    // exact semantics are defined in the .cu implementation.
    float _scaleIntermediates, _scaleTarget;
    std::map<int,NVMatrix*>* _mats;  // device id -> matrix (non-owning; NULL for control messages)
public:
    ReduceMessage(REDUCE_MESSAGE_TYPE msgType, float scaleIntermediates, float scaleTarget, std::map<int,NVMatrix*>& mats)
        : _msgType(msgType), _scaleIntermediates(scaleIntermediates), _scaleTarget(scaleTarget), _mats(&mats) {
    }
    // Control-message constructor: no matrices, zeroed scales.
    ReduceMessage(REDUCE_MESSAGE_TYPE msgType)
        : _msgType(msgType), _scaleIntermediates(0), _scaleTarget(0), _mats(NULL) {
    }
    inline REDUCE_MESSAGE_TYPE getType() const {
        return _msgType;
    }
    inline float getScaleIntermediates() const {
        return _scaleIntermediates;
    }
    inline float getScaleTarget() const {
        return _scaleTarget;
    }
    // Looks up the matrix for a device; undefined for control messages (_mats NULL).
    inline NVMatrix& getMatrix(int deviceID) const {
        return *_mats->at(deviceID);
    }
    inline std::map<int,NVMatrix*>& getMatrices() const {
        return *_mats;
    }
};
|
||||
|
||||
class ReduceChunkMessage : public ReduceMessage {
|
||||
protected:
|
||||
int _chunkIdx;
|
||||
int _chunkSize;
|
||||
int _numChunks;
|
||||
|
||||
IReduceSegment* _src;
|
||||
public:
|
||||
ReduceChunkMessage(IReduceSegment& src, int chunkIdx, int chunkSize, int numChunks, float scaleIntermediates, float scaleTarget, std::map<int,NVMatrix*>& mats)
|
||||
: _src(&src), _chunkIdx(chunkIdx), _chunkSize(chunkSize), _numChunks(numChunks),
|
||||
ReduceMessage(REDUCE_CHUNK, scaleIntermediates, scaleTarget, mats) {
|
||||
}
|
||||
|
||||
inline int getChunkIdx() const {
|
||||
return _chunkIdx;
|
||||
}
|
||||
|
||||
inline int getChunkSize() const {
|
||||
return _chunkSize;
|
||||
}
|
||||
|
||||
inline int getNumChunks() const {
|
||||
return _numChunks;
|
||||
}
|
||||
|
||||
inline IReduceSegment& getSource() const {
|
||||
return *_src;
|
||||
}
|
||||
};
|
||||
|
||||
class ReduceStartMessage : public ReduceMessage {
|
||||
public:
|
||||
ReduceStartMessage(float scaleIntermediates, float scaleTarget, std::map<int,NVMatrix*>& mats)
|
||||
: ReduceMessage(REDUCE_START, scaleIntermediates, scaleTarget, mats) {
|
||||
}
|
||||
};
|
||||
|
||||
class IReduceSegment : public Thread {
|
||||
protected:
|
||||
int _deviceID;
|
||||
std::vector<IReduceSegment*> _prev;
|
||||
ReducePeer* _next;
|
||||
Queue<ReduceMessage*> _queue;
|
||||
Queue<int>* _finishQueue;
|
||||
|
||||
NVMatrix& getChunk(const NVMatrix& mat, int chunkSize, int chunkIdx);
|
||||
void* run();
|
||||
virtual bool processMessage(ReduceMessage& msg) = 0;
|
||||
|
||||
public:
|
||||
IReduceSegment(IEightGPUReducer& parent, int deviceID, Queue<int>* finishQueue);
|
||||
virtual ~IReduceSegment();
|
||||
inline virtual NVMatrix& getMatrix(ReduceMessage& msg);
|
||||
Queue<ReduceMessage*>& getQueue();
|
||||
int getDeviceID() const;
|
||||
void addPrev(IReduceSegment& c);
|
||||
void addNext(ReducePeer& c);
|
||||
bool isTerminal() const;
|
||||
};
|
||||
|
||||
class ReducerSource : public IReduceSegment {
|
||||
protected:
|
||||
bool processMessage(ReduceMessage& msg);
|
||||
public:
|
||||
ReducerSource(IEightGPUReducer& parent, int deviceID);
|
||||
};
|
||||
|
||||
class ReducePeer : public IReduceSegment {
|
||||
protected:
|
||||
std::map<int,cudaStream_t> _streams; // device id -> stream
|
||||
std::map<int,int> _numInputsReceived; // chunk idx -> num inputs
|
||||
int _numInputsFinished;
|
||||
HostNVMatrix _mat;
|
||||
bool _add;
|
||||
bool processMessage(ReduceMessage& msg);
|
||||
inline cudaStream_t getStream(int deviceID);
|
||||
inline NVMatrix& getMatrix(ReduceMessage& msg);
|
||||
void hostAdd(const float* src, float* tgt, const int n, const float scaleTgt);
|
||||
public:
|
||||
ReducePeer(IEightGPUReducer& parent, int deviceID, Queue<int>* finishQueue);
|
||||
ReducePeer(IEightGPUReducer& parent);
|
||||
~ReducePeer();
|
||||
};
|
||||
|
||||
class IEightGPUReducer {
|
||||
protected:
|
||||
std::vector<ReducerSource*> _sources;
|
||||
std::vector<ReducePeer*> _peers;
|
||||
Queue<int> _finishQueue;
|
||||
int _tgtDeviceID;
|
||||
virtual void makeConnections(std::vector<int>& same, std::vector<int>&other) = 0;
|
||||
public:
|
||||
IEightGPUReducer(int tgtDeviceID);
|
||||
virtual ~IEightGPUReducer();
|
||||
IEightGPUReducer& construct();
|
||||
void reduce(std::map<int, NVMatrix*>& mats, float scaleIntermediates, float scaleTarget);
|
||||
void reduce(std::map<int, NVMatrix*>& mats, float scaleIntermediates);
|
||||
void reduce(std::map<int, NVMatrix*>& mats);
|
||||
int getTgtDeviceID() const;
|
||||
};
|
||||
|
||||
class EightGPUReducer1 : public IEightGPUReducer {
|
||||
protected:
|
||||
void makeConnections(std::vector<int>& same, std::vector<int>&other);
|
||||
public:
|
||||
EightGPUReducer1(int tgtDeviceID);
|
||||
};
|
||||
|
||||
class EightGPUReducer2 : public IEightGPUReducer {
|
||||
protected:
|
||||
void makeConnections(std::vector<int>& same, std::vector<int>&other);
|
||||
public:
|
||||
EightGPUReducer2(int tgtDeviceID);
|
||||
};
|
||||
|
||||
#endif /* REDUCEPIPELINE_CUH_H_ */
|
@ -0,0 +1,53 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef STREAMBROADCAST_CUH_
|
||||
#define STREAMBROADCAST_CUH_
|
||||
|
||||
#include <iostream>
|
||||
#include "../../util/include/queue.h"
|
||||
#include "../../nvmatrix/include/nvmatrix.cuh"
|
||||
#include "util.cuh"
|
||||
|
||||
class Layer;
|
||||
|
||||
//#define NUM_STREAM_COPY_PARTS 4
|
||||
// This is in 4-byte words, not bytes
|
||||
#define SB_MIN_CHUNK_SIZE (1<<17)
|
||||
#define SB_MAX_CHUNKS 16
|
||||
|
||||
class StreamBroadcast {
|
||||
protected:
|
||||
std::map<int,cudaStream_t> _streams;
|
||||
std::set<int> _ownedStreams;
|
||||
HostNVMatrix _hostMem;
|
||||
void toHostMem(NVMatrix& src, NVMatrix& hostmem, int srcDevice);
|
||||
void toTarget(NVMatrix& hostmem, NVMatrix& tgt, int tgtDevice, float scaleTarget, float scaleOutput);
|
||||
void init(std::map<int,cudaStream_t>& streams);
|
||||
void init(std::map<int,NVMatrix*>& mats);
|
||||
public:
|
||||
StreamBroadcast(std::map<int,cudaStream_t>& streams);
|
||||
StreamBroadcast();
|
||||
virtual ~StreamBroadcast();
|
||||
|
||||
void transfer(std::map<int,NVMatrix*>& mats, HostNVMatrix& hostmem, int srcDevice, float scaleTarget, float scaleOutput);
|
||||
void transfer(std::map<int,NVMatrix*>& mats, int srcDevice, float scaleTarget, float scaleOutput);
|
||||
void transfer(std::map<int,NVMatrix*>& mats, int srcDevice);
|
||||
void sync(int deviceID);
|
||||
cudaStream_t getStream(int deviceID);
|
||||
};
|
||||
|
||||
#endif /* STREAMBROADCAST_CUH_ */
|
52
caffe2/contrib/cuda-convnet2/cudaconvnet/include/timer.cuh
Normal file
52
caffe2/contrib/cuda-convnet2/cudaconvnet/include/timer.cuh
Normal file
@ -0,0 +1,52 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef TIMER_CC_H_
|
||||
#define TIMER_CC_H_
|
||||
|
||||
#include <helper_timer.h>
|
||||
|
||||
class Timer {
|
||||
protected:
|
||||
StopWatchInterface* _timer;
|
||||
bool _started;
|
||||
|
||||
public:
|
||||
Timer() : _started(false) {
|
||||
sdkCreateTimer(&_timer);
|
||||
}
|
||||
|
||||
~Timer() {
|
||||
sdkDeleteTimer(&_timer);
|
||||
}
|
||||
inline void start () {
|
||||
_started = true;
|
||||
sdkResetTimer(&_timer);
|
||||
sdkStartTimer(&_timer);
|
||||
}
|
||||
|
||||
inline double stop() {
|
||||
sdkStopTimer(&_timer);
|
||||
_started = false;
|
||||
return sdkGetTimerValue(&_timer);
|
||||
}
|
||||
|
||||
inline bool isStarted() const {
|
||||
return _started;
|
||||
}
|
||||
};
|
||||
|
||||
#endif /* TIMER_CC_H_ */
|
130
caffe2/contrib/cuda-convnet2/cudaconvnet/include/util.cuh
Normal file
130
caffe2/contrib/cuda-convnet2/cudaconvnet/include/util.cuh
Normal file
@ -0,0 +1,130 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef UTIL_H
|
||||
#define UTIL_H
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <Python.h>
|
||||
#include "../../nvmatrix/include/nvmatrix.cuh"
|
||||
#include "../../util/include/matrix.h"
|
||||
|
||||
|
||||
#define PASS_TYPE uint
|
||||
#define PASS_TRAIN 0x1
|
||||
#define PASS_TEST 0x2
|
||||
#define PASS_GC 0x4
|
||||
#define PASS_MULTIVIEW_TEST (PASS_TEST | 0x8)
|
||||
#define PASS_MULTIVIEW_TEST_START (PASS_MULTIVIEW_TEST | 0x10)
|
||||
#define PASS_MULTIVIEW_TEST_END (PASS_MULTIVIEW_TEST | 0x20)
|
||||
#define PASS_FEATURE_GEN 0x40
|
||||
|
||||
#define HAS_FLAG(f, x) (((x) & (f)) == (f))
|
||||
#define IS_MULTIVIEW_TEST(x) HAS_FLAG(PASS_MULTIVIEW_TEST, x)
|
||||
#define IS_MULTIVIEW_TEST_START(x) HAS_FLAG(PASS_MULTIVIEW_TEST_START, x)
|
||||
#define IS_MULTIVIEW_TEST_END(x) HAS_FLAG(PASS_MULTIVIEW_TEST_END, x)
|
||||
#define IS_TEST(x) HAS_FLAG(PASS_TEST, x)
|
||||
#define IS_TRAIN(x) HAS_FLAG(PASS_TRAIN, x)
|
||||
|
||||
// For gradient checking
|
||||
#define GC_SUPPRESS_PASSES false
|
||||
#define GC_REL_ERR_THRESH 0.02
|
||||
|
||||
#ifdef DO_PRINT
|
||||
#define PRINT(x, args...) printf(x, ## args);
|
||||
#else
|
||||
#define PRINT(x, args...) ;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Generates a random floating point number in the range 0-1.
|
||||
*/
|
||||
#define randf ((float)rand() / RAND_MAX)
|
||||
|
||||
//typedef std::vector<Matrix*> MatrixV;
|
||||
//typedef std::vector<NVMatrix*> NVMatrixV;
|
||||
typedef std::map<std::string,std::vector<double>*> CostMap;
|
||||
typedef std::map<std::string,double> CostCoeffMap;
|
||||
typedef std::vector<double> doublev;
|
||||
typedef std::vector<float> floatv;
|
||||
typedef std::vector<int> intv;
|
||||
typedef std::vector<std::string> stringv;
|
||||
typedef std::set<int> seti;
|
||||
typedef std::vector<PyObject*> PyObjectV;
|
||||
|
||||
stringv* getStringV(PyObject* pyList);
|
||||
floatv* getFloatV(PyObject* pyList);
|
||||
intv* getIntV(PyObject* pyList);
|
||||
MatrixV* getMatrixV(PyObject* pyList);
|
||||
MatrixV* getMatrixV(PyObject* pyList, int len);
|
||||
int* getIntA(PyObject* pyList);
|
||||
|
||||
int pyDictGetInt(PyObject* dict, const char* key);
|
||||
intv* pyDictGetIntV(PyObject* dict, const char* key);
|
||||
std::string pyDictGetString(PyObject* dict, const char* key);
|
||||
float pyDictGetFloat(PyObject* dict, const char* key);
|
||||
floatv* pyDictGetFloatV(PyObject* dict, const char* key);
|
||||
Matrix* pyDictGetMatrix(PyObject* dict, const char* key);
|
||||
MatrixV* pyDictGetMatrixV(PyObject* dict, const char* key);
|
||||
int* pyDictGetIntA(PyObject* dict, const char* key);
|
||||
stringv* pyDictGetStringV(PyObject* dict, const char* key);
|
||||
bool pyDictHasKey(PyObject* dict, const char* key);
|
||||
PyObjectV* pyDictGetValues(PyObject* dict);
|
||||
|
||||
template<typename T> std::string tostr(T n);
|
||||
template<typename T> void shuffleVector(std::vector<T>& v, int start, int end);
|
||||
template<class T> void deleteElements(std::vector<T*>& v);
|
||||
template<class T> void deleteElements(std::vector<T*>& v, bool deleteContainer);
|
||||
|
||||
template<class T>
|
||||
int indexOf(std::vector<T>& v, T e) {
|
||||
int i = 0;
|
||||
// typename vector<T>::iterator it2 = v.begin();
|
||||
for (typename std::vector<T>::const_iterator it = v.begin(); it != v.end(); ++it) {
|
||||
if (*it == e) {
|
||||
return i;
|
||||
}
|
||||
++i;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
std::vector<int>& getDeviceCPUs(int deviceID);
|
||||
|
||||
template<typename K, typename V> std::set<K> getKeys(std::map<K,V>& m) {
|
||||
std::set<K> s;
|
||||
for (typename std::map<K,V>::const_iterator it = m.begin(); it != m.end(); ++it) {
|
||||
s.insert(it->first);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
struct LayerIDComparator {
|
||||
bool operator()(PyObject* i, PyObject* j) {
|
||||
return pyDictGetInt(i, "id") < pyDictGetInt(j, "id");
|
||||
}
|
||||
};
|
||||
|
||||
#endif /* UTIL_H */
|
||||
|
159
caffe2/contrib/cuda-convnet2/cudaconvnet/include/weights.cuh
Normal file
159
caffe2/contrib/cuda-convnet2/cudaconvnet/include/weights.cuh
Normal file
@ -0,0 +1,159 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef WEIGHTS_CUH
|
||||
#define WEIGHTS_CUH
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <helper_cuda.h>
|
||||
#include <assert.h>
|
||||
#include "../../nvmatrix/include/nvmatrix.cuh"
|
||||
#include "../../util/include/matrix.h"
|
||||
#include "util.cuh"
|
||||
#include "lr.cuh"
|
||||
#include "layer.cuh"
|
||||
#include "copypipeline.cuh"
|
||||
#include "reducepipeline.cuh"
|
||||
#include "streambroadcast.cuh"
|
||||
|
||||
class Layer;
|
||||
class Weights;
|
||||
class StreamBroadcast;
|
||||
|
||||
class IWeightReducer {
|
||||
protected:
|
||||
int _tgtReplicaID;
|
||||
std::map<int,Weights*> _replicas;
|
||||
|
||||
int getDeviceID();
|
||||
public:
|
||||
IWeightReducer(std::map<int,Weights*>& replicas, int srcReplicaID);
|
||||
virtual ~IWeightReducer();
|
||||
static IWeightReducer& make(std::map<int,Weights*>& replicas, int srcReplicaID);
|
||||
virtual void reduce(std::map<int, NVMatrix*> gradShards, float gradScale, bool toInc) = 0;
|
||||
};
|
||||
|
||||
class SequentialWeightReducer : public IWeightReducer {
|
||||
protected:
|
||||
StreamBroadcast* _sb;
|
||||
public:
|
||||
SequentialWeightReducer(std::map<int,Weights*>& replicas, int srcReplicaID);
|
||||
~SequentialWeightReducer();
|
||||
void reduce(std::map<int, NVMatrix*> gradShards, float gradScale, bool toInc);
|
||||
};
|
||||
|
||||
class ParallelWeightReducer : public IWeightReducer {
|
||||
protected:
|
||||
IEightGPUReducer* _reducer;
|
||||
public:
|
||||
ParallelWeightReducer(std::map<int,Weights*>& replicas, int srcReplicaID);
|
||||
~ParallelWeightReducer();
|
||||
void reduce(std::map<int, NVMatrix*> gradShards, float gradScale, bool toInc);
|
||||
};
|
||||
|
||||
class Weights {
|
||||
protected:
|
||||
Matrix* _hWeights, *_hWeightsInc;
|
||||
NVMatrix* _weights, *_weightsInc, *_weightsGrad;
|
||||
|
||||
ParameterSchedule* _lrs;
|
||||
|
||||
float _wc, _mom, _wball;
|
||||
bool _onGPU, _useGrad, _cleanup;
|
||||
int _numUpdates;
|
||||
|
||||
// Note: every layer is its own sibling too
|
||||
std::map<int,Weights*> _replicas;
|
||||
|
||||
// Non-NULL if these weights are really shared from some other layer
|
||||
Weights* _srcWeights;
|
||||
Layer* _parent;
|
||||
int _shardSize;
|
||||
IWeightReducer* _reducer;
|
||||
ISafeBroadcastNetwork* _broadcaster;
|
||||
|
||||
void aggregateReplicaGradients(float progress);
|
||||
|
||||
// TODO: assert that these retrun contiguous views
|
||||
template<class T> T& getShard(T& mat, int replicaID);
|
||||
template<class T> T& getShard(T& mat);
|
||||
void init(Matrix& hWeights, Matrix& hWeightsInc, ParameterSchedule& lrs, Layer& parent, float wc, float wball, float mom, bool useGrad, bool cleanup);
|
||||
|
||||
public:
|
||||
NVMatrix& operator*() const;
|
||||
|
||||
Weights(Weights& srcWeights, ParameterSchedule& lrs, Layer& parent);
|
||||
Weights(Matrix& hWeights, Matrix& hWeightsInc, ParameterSchedule& lrs, Layer& parent,
|
||||
float wc, float wball, float mom, bool useGrad);
|
||||
|
||||
virtual ~Weights();
|
||||
|
||||
virtual NVMatrix& getW() const;
|
||||
virtual NVMatrix& getInc() const;
|
||||
virtual NVMatrix& getGrad() const;
|
||||
virtual Matrix& getCPUW() const;
|
||||
virtual Matrix& getCPUWInc() const;
|
||||
virtual ParameterSchedule& getLearningRateSchedule() const;
|
||||
virtual int getNumRows() const;
|
||||
virtual int getNumCols() const;
|
||||
virtual void copyToCPU();
|
||||
|
||||
// This function is assumed to be called in the order in which the layers
|
||||
// were defined
|
||||
virtual void copyToGPU();
|
||||
|
||||
virtual void update(float progress);
|
||||
virtual void addReplica(Weights& sibling);
|
||||
int incNumUpdates();
|
||||
|
||||
// Returns the number of times a gradient has been computed for this
|
||||
// weight matrix during the current pass (interval between two calls of update())
|
||||
// through the net. This number will only be greater than 1 if this weight matrix
|
||||
// is *shared* by multiple layers in the net.
|
||||
int getNumUpdates() const;
|
||||
float getEps(float progress) const;
|
||||
float getMom() const;
|
||||
float getWC() const;
|
||||
float getWBall() const;
|
||||
bool isUseGrad() const;
|
||||
bool isOwner() const;
|
||||
int getReplicaID();
|
||||
int getDeviceID();
|
||||
Layer& getParent();
|
||||
std::map<int,Weights*>& getReplicas();
|
||||
ISafeBroadcastNetwork& getBroadcaster();
|
||||
IWeightReducer& getReducer();
|
||||
};
|
||||
|
||||
class WeightList {
|
||||
private:
|
||||
std::vector<Weights*> _weightList;
|
||||
public:
|
||||
Weights& operator[](const int idx) const;
|
||||
~WeightList();
|
||||
WeightList();
|
||||
Weights& at(const int i) const;
|
||||
void addWeights(Weights& w);
|
||||
void addReplica(WeightList& sibling);
|
||||
void update(float progress);
|
||||
void copyToCPU();
|
||||
void copyToGPU();
|
||||
int getSize() const;
|
||||
};
|
||||
|
||||
#endif /* WEIGHTS_CUH */
|
123
caffe2/contrib/cuda-convnet2/cudaconvnet/include/worker.cuh
Normal file
123
caffe2/contrib/cuda-convnet2/cudaconvnet/include/worker.cuh
Normal file
@ -0,0 +1,123 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef WORKER_CUH
|
||||
#define WORKER_CUH
|
||||
|
||||
#include "convnet.cuh"
|
||||
#include "cost.cuh"
|
||||
#include "data.cuh"
|
||||
|
||||
class ConvNet;
|
||||
class Cost;
|
||||
|
||||
class WorkResult {
|
||||
public:
|
||||
enum RESULTS {BATCH_DONE, SYNC_DONE};
|
||||
protected:
|
||||
WorkResult::RESULTS _resultType;
|
||||
Cost* _results;
|
||||
public:
|
||||
WorkResult(WorkResult::RESULTS resultType, Cost& results);
|
||||
WorkResult(WorkResult::RESULTS resultType);
|
||||
virtual ~WorkResult();
|
||||
Cost& getResults() const;
|
||||
WorkResult::RESULTS getResultType() const;
|
||||
};
|
||||
|
||||
class Worker {
|
||||
protected:
|
||||
ConvNet* _convNet;
|
||||
public:
|
||||
Worker(ConvNet& convNet);
|
||||
virtual ~Worker();
|
||||
virtual bool run() = 0;
|
||||
};
|
||||
|
||||
class DataWorker : public Worker {
|
||||
protected:
|
||||
CPUData* _data;
|
||||
DataProvider* _dp;
|
||||
public:
|
||||
DataWorker(ConvNet& convNet, CPUData& data);
|
||||
virtual ~DataWorker();
|
||||
bool run();
|
||||
virtual void _run() = 0;
|
||||
};
|
||||
|
||||
class TrainingWorker : public DataWorker {
|
||||
protected:
|
||||
bool _test;
|
||||
double _progress;
|
||||
public:
|
||||
TrainingWorker(ConvNet& convNet, CPUData& data, double progress, bool test);
|
||||
void _run();
|
||||
};
|
||||
|
||||
class SyncWorker : public Worker {
|
||||
public:
|
||||
SyncWorker(ConvNet& convNet);
|
||||
bool run();
|
||||
};
|
||||
|
||||
class ExitWorker : public Worker {
|
||||
public:
|
||||
ExitWorker(ConvNet& convNet);
|
||||
bool run();
|
||||
};
|
||||
|
||||
class GradCheckWorker : public DataWorker {
|
||||
public:
|
||||
GradCheckWorker(ConvNet& convNet, CPUData& data);
|
||||
void _run();
|
||||
};
|
||||
|
||||
class MultiviewTestWorker : public DataWorker {
|
||||
protected:
|
||||
int _numViews;
|
||||
Matrix* _cpuProbs;
|
||||
std::string _logregName;
|
||||
CPUData& getMinibatch(int v, int i);
|
||||
public:
|
||||
MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews, Matrix& cpuProbs, const char* softmaxName);
|
||||
MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews);
|
||||
~MultiviewTestWorker();
|
||||
void _run();
|
||||
};
|
||||
|
||||
class FeatureWorker : public DataWorker {
|
||||
protected:
|
||||
MatrixV *_ftrs;
|
||||
stringv *_layerNames;
|
||||
bool _deleteFeatures;
|
||||
public:
|
||||
FeatureWorker(ConvNet& convNet, CPUData& data, MatrixV& ftrs, stringv& layerNames, bool deleteFeatures=true);
|
||||
~FeatureWorker();
|
||||
void _run();
|
||||
};
|
||||
|
||||
class DataGradWorker : public DataWorker {
|
||||
protected:
|
||||
Matrix* _dataGrads;
|
||||
int _dataLayerIdx, _softmaxLayerIdx;
|
||||
public:
|
||||
DataGradWorker(ConvNet& convNet, CPUData& data, Matrix& dataGrads, int dataLayerIdx, int softmaxLayerIdx);
|
||||
~DataGradWorker();
|
||||
void _run();
|
||||
};
|
||||
|
||||
#endif/* WORKER_CUH */
|
||||
|
107
caffe2/contrib/cuda-convnet2/cudaconvnet/src/actbroadcaster.cu
Normal file
107
caffe2/contrib/cuda-convnet2/cudaconvnet/src/actbroadcaster.cu
Normal file
@ -0,0 +1,107 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "../include/actbroadcaster.cuh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/*
|
||||
* =====================
|
||||
* BroadcastMessage
|
||||
* =====================
|
||||
*/
|
||||
BroadcastMessage::BroadcastMessage(map<int, NVMatrix*> mats, int srcDevice, int userIdx, Queue<int>& finishQueue)
|
||||
: _type(BROADCAST), _mats(mats), _srcDevice(srcDevice), _userIdx(userIdx), _finishQueue(&finishQueue) {
|
||||
}
|
||||
|
||||
BroadcastMessage::BroadcastMessage(MESSAGE_TYPE type)
|
||||
: _type(type), _finishQueue(NULL) {
|
||||
}
|
||||
|
||||
int BroadcastMessage::getSrcDevice() {
|
||||
return _srcDevice;
|
||||
}
|
||||
|
||||
map<int, NVMatrix*>& BroadcastMessage::getMatrices() {
|
||||
return _mats;
|
||||
}
|
||||
|
||||
int BroadcastMessage::getUserIdx() {
|
||||
return _userIdx;
|
||||
}
|
||||
|
||||
Queue<int>& BroadcastMessage::getFinishQueue() {
|
||||
return *_finishQueue;
|
||||
}
|
||||
|
||||
BroadcastMessage::MESSAGE_TYPE BroadcastMessage::getMessageType() {
|
||||
return _type;
|
||||
}
|
||||
|
||||
/*
|
||||
* =====================
|
||||
* ExitBroadcastMessage
|
||||
* =====================
|
||||
*/
|
||||
ExitBroadcastMessage::ExitBroadcastMessage() : BroadcastMessage(BroadcastMessage::EXIT) {
|
||||
}
|
||||
|
||||
/*
|
||||
* =====================
|
||||
* ActBroadcaster
|
||||
* =====================
|
||||
*/
|
||||
ActBroadcaster::ActBroadcaster(int numUsers, intv& cpus) : Thread(true, cpus), _numUsers(numUsers) {
|
||||
}
|
||||
|
||||
ActBroadcaster::~ActBroadcaster() {
|
||||
for (map<int,IBroadcastNetwork*>::const_iterator it = _broadcasters.begin(); it != _broadcasters.end(); ++it) {
|
||||
delete it->second;
|
||||
}
|
||||
}
|
||||
|
||||
Queue<BroadcastMessage*>& ActBroadcaster::getMessageQueue() {
|
||||
return _messageQueue;
|
||||
}
|
||||
|
||||
void* ActBroadcaster::run() {
|
||||
int nextUserIdx = 0;
|
||||
bool exit = false;
|
||||
while (!exit) {
|
||||
BroadcastMessage& msg = *_messageQueue.dequeue();
|
||||
if (msg.getMessageType() == BroadcastMessage::EXIT) {
|
||||
exit = true;
|
||||
delete &msg;
|
||||
} else {
|
||||
if (msg.getUserIdx() == nextUserIdx) {
|
||||
if (_broadcasters.count(msg.getSrcDevice()) == 0) {
|
||||
_broadcasters[msg.getSrcDevice()] = &IBroadcastNetwork::make(getKeys(msg.getMatrices()), msg.getSrcDevice());
|
||||
}
|
||||
_broadcasters[msg.getSrcDevice()]->broadcast(msg.getMatrices());
|
||||
msg.getFinishQueue().enqueue(0);
|
||||
delete &msg;
|
||||
nextUserIdx = (nextUserIdx + 1) % _numUsers;
|
||||
} else {
|
||||
_messageQueue.enqueue(&msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void ActBroadcaster::stop() {
|
||||
getMessageQueue().enqueue(new ExitBroadcastMessage());
|
||||
join();
|
||||
}
|
782
caffe2/contrib/cuda-convnet2/cudaconvnet/src/convnet.cu
Normal file
782
caffe2/contrib/cuda-convnet2/cudaconvnet/src/convnet.cu
Normal file
@ -0,0 +1,782 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <set>
|
||||
#include <map>
|
||||
|
||||
#include "../../nvmatrix/include/nvmatrix.cuh"
|
||||
#include "../../nvmatrix/include/nvmatrix_operators.cuh"
|
||||
#include "../../util/include/matrix.h"
|
||||
#include "../include/convnet.cuh"
|
||||
#include "../include/util.cuh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/*
|
||||
* =======================
|
||||
* ConvNet
|
||||
* =======================
|
||||
*/
|
||||
ConvNet::ConvNet(PyObject* layerParams, intv& deviceIDs,
|
||||
int minibatchSize, bool conserveMem) : Thread(true) {
|
||||
_deviceIDs = deviceIDs;
|
||||
_data = NULL;
|
||||
_bufferData = NULL;
|
||||
_bufferMinibatchIdx = -1;
|
||||
_bufferPassIdx = -1;
|
||||
_trainingProgress = 0;
|
||||
_totalPassesDone = 0;
|
||||
_conserveMem = conserveMem;
|
||||
_sync = new ThreadSynchronizer(deviceIDs.size() + 1);
|
||||
PyObjectV* layerList = pyDictGetValues(layerParams);
|
||||
std::sort(layerList->begin(), layerList->end(), LayerIDComparator());
|
||||
|
||||
|
||||
_dataCopyPD = new PipeDispenserBlocking(DIVUP(_deviceIDs.size(),2)); // hard-coded for now
|
||||
|
||||
initDataLayers(layerList);
|
||||
initGPUThreads(layerList);
|
||||
connectReplicas(); // Connect replicas to one another
|
||||
connectChildren(layerParams); // Connect forward/backward links in graph
|
||||
_numFwdTerminal = 0;
|
||||
// Execute post-initialization stuff
|
||||
for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
|
||||
for (int r = 0; r < it->second.size(); r++) {
|
||||
_numFwdTerminal += it->second[r]->getNext().size() == 0;
|
||||
if (it->second[r]->getNext().size() == 0) {
|
||||
printf("Fwd terminal: %s\n", it->second[r]->getName().c_str());
|
||||
}
|
||||
it->second[r]->postInit();
|
||||
}
|
||||
}
|
||||
|
||||
// Find and count the terminal nodes in the backward pass
|
||||
for (int p = 0; p < getNumPasses(); p++) {
|
||||
set<Layer*> visited;
|
||||
_numBwdTerminal[p] = 0;
|
||||
for (int t = 0; t < _convNetThreads.size(); t++) {
|
||||
vector<CostLayer*>& cl = _convNetThreads[t]->getCostLayers();
|
||||
for (int c = 0; c < cl.size(); c++) {
|
||||
findBwdTerminal(*cl[c], visited, _numBwdTerminal[p], p);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
_dp = new DataProvider(minibatchSize);
|
||||
// Py_DECREF(layerList);
|
||||
delete layerList;
|
||||
}
|
||||
|
||||
ConvNet::~ConvNet() {
|
||||
for (vector<ConvNetThread*>::const_iterator it = _convNetThreads.begin(); it != _convNetThreads.end(); ++it) {
|
||||
(*it)->getMessageQueue().enqueue(new Message(EXIT_CONVNET));
|
||||
(*it)->join();
|
||||
delete *it;
|
||||
}
|
||||
for (DataLayerVector::const_iterator it = _dataLayers.begin(); it != _dataLayers.end(); ++it) {
|
||||
delete *it;
|
||||
}
|
||||
for (intv::const_iterator it = _deviceIDs.begin(); it != _deviceIDs.end(); ++it) {
|
||||
DEVICE_MEMORY_MANAGER::destroyInstance(*it);
|
||||
}
|
||||
HOST_MEMORY_MANAGER::destroyInstance();
|
||||
delete _sync;
|
||||
delete _dataCopyPD;
|
||||
delete _dp;
|
||||
}
|
||||
|
||||
void ConvNet::stop() {
|
||||
getWorkerQueue().enqueue(new ExitWorker(*this));
|
||||
join();
|
||||
}
|
||||
|
||||
PipeDispenser& ConvNet::getDataCopyPD() {
|
||||
return *_dataCopyPD;
|
||||
}
|
||||
|
||||
void ConvNet::initDataLayers(PyObjectV* layerList) {
|
||||
for (int i = 0; i < layerList->size(); i++) {
|
||||
PyObject* paramsDict = layerList->at(i);
|
||||
std::string layerType = pyDictGetString(paramsDict, "type");
|
||||
|
||||
if (layerType == "data") {
|
||||
int numReplicas = pyDictGetInt(paramsDict, "numReplicas");
|
||||
for (int r = 0; r < numReplicas; ++r) {
|
||||
DataLayer* dataLayer = new DataLayer(this, paramsDict, r);
|
||||
_dataLayers.push_back(dataLayer);
|
||||
_layerMap[dataLayer->getName()][r] = dataLayer;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ConvNet::initGPUThreads(PyObjectV* layerList) {
|
||||
// Initialize GPU worker threads
|
||||
for (int i = 0; i < _deviceIDs.size(); ++i) {
|
||||
ConvNetThread* cng = new ConvNetThread(layerList, _deviceIDs[i], i, this);
|
||||
_convNetThreads.push_back(cng);
|
||||
for (NameLayerMap::iterator it = cng->getLayerMap().begin(); it != cng->getLayerMap().end(); ++it) {
|
||||
const std::string& name = it->first;
|
||||
Layer* layer = it->second;
|
||||
_layerMap[name][layer->getReplicaID()] = layer;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ConvNet::connectReplicas() {
|
||||
_numReplicasMax = 0;
|
||||
_numReplicasMin = 1 << 16;
|
||||
for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
|
||||
_numReplicasMax = max(_numReplicasMax, int(it->second.size()));
|
||||
_numReplicasMin = min(_numReplicasMin, int(it->second.size()));
|
||||
for (map<int,Layer*>::iterator it2 = it->second.begin(); it2 != it->second.end(); ++it2) {
|
||||
Layer& l1 = *it2->second;
|
||||
for (map<int,Layer*>::iterator it3 = it->second.begin(); it3 != it->second.end(); ++it3) {
|
||||
Layer& l2 = *it3->second;
|
||||
l1.addReplica(l2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Wires up prev/next edges between layers according to each layer's "inputs"
// list in the Python layer-parameter dict. When an input layer has more
// replicas than this layer, each replica of this layer is connected to
// numReplicasPrev / numReplicas input replicas, strided by numReplicas.
void ConvNet::connectChildren(PyObject* layerParams) {
    for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
        PyObject* paramsDict = PyDict_GetItemString(layerParams, it->first.c_str());
        PyObject* inputList = PyDict_GetItemString(paramsDict, "inputs");
        if (inputList != NULL) {
            // Iterate over "replicas" of this layer
            int numReplicas = _layerMap[it->first].size();
            for (int i = 0; i < PyList_GET_SIZE(inputList); i++) {
                std::string inputName = PyString_AsString(PyList_GetItem(inputList, i));
                int numReplicasPrev = _layerMap[inputName].size();
                // How many replicas from the previous layer must this layer be connected to?
                int numInputReplicas = numReplicasPrev / numReplicas;
                for (int r = 0; r < numReplicas; r++) {
                    // Replica r connects to input replicas r, r+numReplicas, r+2*numReplicas, ...
                    for (int rp = r, ridx = 0; ridx < numInputReplicas; rp += numReplicas, ridx++) {
                        it->second[r]->addPrev(*_layerMap[inputName][rp], ridx);
                        _layerMap[inputName][rp]->addNext(*it->second[r]);
                    }
                }
            }
        }
    }
}
|
||||
|
||||
// Depth-first search from layer l toward the inputs, marking (and counting)
// "backward terminals": grad-consuming layers at which the bprop pass for
// passIdx stops, either because nothing below them consumes gradients, they
// don't produce gradients, or their input replication requires more passes.
void ConvNet::findBwdTerminal(Layer& l, set<Layer*>& visited, int& terminal, int passIdx) {
    if (visited.count(&l) == 0) {
        visited.insert(&l);
        if (l.isGradConsumer()) {
            bool hasPrevConsumer = false;
            if (l.getPrev().size() > 0) {
                for (int i = 0; i < l.getPrev()[0].size(); i++) {
                    // Looking only at 0th replica is fine to see if you have
                    // grad consumers below you.
                    hasPrevConsumer |= l.getPrev()[0][i]->isGradConsumer();
                }
            }
            if (!hasPrevConsumer || !l.isGradProducer() || (passIdx + 1 < l.getNumReplicasPrev() && l.getNumReplicasPrev() > l.getNumReplicas())) {
                terminal++;
                l.setBwdTerminal(passIdx);
                printf("found bwd terminal %s[%d] in passIdx=%d\n", l.getName().c_str(), l.getReplicaID(), passIdx);
            } else if (l.isGradProducer()) {
                // Recurse into every input replica of every input.
                for (int r = 0; r < l.getPrev().size(); r++) {
                    for (int i = 0; i < l.getPrev()[r].size(); i++) {
                        findBwdTerminal(*l.getPrev()[r][i], visited, terminal, passIdx);
                    }
                }
            }
        }
    }
}
|
||||
|
||||
// Manager thread entry point: starts all GPU worker threads, then consumes
// Worker tasks from _workerQueue until one of them requests exit.
void* ConvNet::run() {
    for (vector<ConvNetThread*>::const_iterator it = _convNetThreads.begin(); it != _convNetThreads.end(); ++it) {
        (*it)->start();
    }
    // The manager thread defaults to using the GPU of the first worker.
    // Put more logic here if this is inappropriate.
    NVMatrix::setDeviceID(_convNetThreads[0]->getDeviceID());
    copyToGPU();
    bool exit = false;
    while (!exit) {
        Worker* worker = _workerQueue.dequeue();
        exit = worker->run(); // a Worker returning true terminates the loop
        delete worker;
    }

    return NULL;
}
|
||||
|
||||
// Queue through which Worker tasks are submitted to the manager thread.
Queue<Worker*>& ConvNet::getWorkerQueue() {
    return _workerQueue;
}
|
||||
|
||||
// Queue on which Workers publish their results.
Queue<WorkResult*>& ConvNet::getResultQueue() {
    return _resultQueue;
}
|
||||
|
||||
// The network's data provider (owned elsewhere; _dp assumed non-NULL here).
DataProvider& ConvNet::getDataProvider() {
    return *_dp;
}
|
||||
|
||||
// Looks up a layer replica by name and replica ID.
// NOTE(review): operator[] inserts a NULL entry if the name/replica is absent.
Layer& ConvNet::getLayer(std::string& name, int replicaID) {
    return *_layerMap[name][replicaID];
}
|
||||
|
||||
// Convenience overload: wraps the message type in a Message object.
void ConvNet::sendMessage(MESSAGES msg, bool sync) {
    sendMessage(new Message(msg), sync);
}
|
||||
|
||||
// Broadcasts a clone of msg to every GPU worker thread, then deletes the
// original. If sync is true, blocks until all children have synchronized.
void ConvNet::sendMessage(Message* msg, bool sync) {
    for (int i = 0; i < _convNetThreads.size(); i++) {
        // Each thread gets its own copy; the thread deletes it after handling.
        _convNetThreads[i]->getMessageQueue().enqueue(msg->clone());
    }

    delete msg;

    if (sync) {
        syncWithChildren();
    }
}
|
||||
|
||||
// Asks all worker threads to copy their layer state to host memory (blocking).
void ConvNet::copyToCPU() {
    sendMessage(COPY_TO_CPU, true);
}
|
||||
|
||||
// Asks all worker threads to copy their layer state to the GPU (non-blocking).
void ConvNet::copyToGPU() {
    sendMessage(COPY_TO_GPU, false);
}
|
||||
|
||||
// Synchronously applies weight updates, then weight constraints, on all workers.
// NOTE(review): passIdx is currently unused by this method.
void ConvNet::updateWeights(int passIdx) {
    sendMessage(UPDATE_WEIGHTS, true);
    sendMessage(CONSTRAIN_WEIGHTS, true);
}
|
||||
|
||||
// Resets worker state: a full RESET at the start of a minibatch
// (passIdx multiple of getNumPasses()), otherwise only the pass index.
void ConvNet::reset(int passIdx) {
    sendMessage((passIdx % getNumPasses()) == 0 ? RESET : RESET_PASS_IDX, false);
}
|
||||
|
||||
// Full reset (equivalent to starting a new minibatch at pass 0).
void ConvNet::reset() {
    reset(0);
}
|
||||
|
||||
// Fprop given data
|
||||
// Forward pass over explicitly supplied data: installs the data in the data
// layers, fprops them, and waits for all forward terminals to report.
void ConvNet::fprop(CPUData& data, int passIdx, PASS_TYPE passType) {
    reset(passIdx);
    // This is necessary because setData below could delete data. If there's
    // an outstanding copy request, this'll cause a segfault.
    for (int i = 0; i < _dataLayers.size(); i++) {
        _dataLayers[i]->waitForCopyFinish();
    }

    setData(data, passIdx);
    for (int i = 0; i < _dataLayers.size(); i++) {
        _dataLayers[i]->fprop(passType, passIdx, false);
    }
    waitForTerminals(_numFwdTerminal, FPROP_TERMINAL);
}
|
||||
|
||||
// Fprop given minibatch idx
|
||||
// Forward pass over minibatch miniIdx with double-buffering: uses the
// previously buffered copy when (miniIdx, passIdx) matches the buffer, and
// kicks off the asynchronous copy of the next minibatch/microbatch.
void ConvNet::fprop(int miniIdx, int passIdx, PASS_TYPE passType) {
    reset(passIdx);

    bool fromBuffer = miniIdx == _bufferMinibatchIdx && passIdx == _bufferPassIdx;
    if (!fromBuffer) {
        // This is necessary because setData below could delete data. If there's
        // an outstanding copy request, this'll cause a segfault.
        for (int i = 0; i < _dataLayers.size(); i++) {
            _dataLayers[i]->waitForCopyFinish();
        }

        setData(_dp->getMinibatch(miniIdx), passIdx);

    } else {
        // The buffered copy is already on its way to the GPU; just swap it in.
        setDataFromBuffer();
    }
    for (int i = 0; i < _dataLayers.size(); i++) {
        _dataLayers[i]->fprop(passType, passIdx, fromBuffer);
    }

    if (passIdx == getNumPasses() - 1) {
        // Do double-buffering from next minibatch from the DataProvider
        setBuffer(miniIdx == _dp->getNumMinibatches() - 1 ? NULL : &_dp->getMinibatch(miniIdx + 1), miniIdx + 1, 0);
    } else {
        // Do double-buffering from next microbatch within current minibatch
        setBuffer(_data, miniIdx, passIdx + 1);
    }

    waitForTerminals(_numFwdTerminal, FPROP_TERMINAL);
}
|
||||
|
||||
// Promotes the double-buffered data to be the current data and clears the
// buffer bookkeeping. The old current data is freed unless it IS the buffer.
void ConvNet::setDataFromBuffer() {
    if (_bufferData != _data) {
        delete _data;
    }
    _data = _bufferData;
    _bufferData = NULL;
    _bufferMinibatchIdx = -1;
    _bufferPassIdx = -1;
}
|
||||
|
||||
// Installs new input data, freeing the previous current data and any stale
// buffered data (taking care not to double-free when they alias each other),
// then starts the synchronous host->GPU copy on every data layer.
void ConvNet::setData(CPUData& data, int passIdx) {
    bool same = _data == _bufferData;
    if (&data != _data) {
        delete _data;
    }
    if (&data != _bufferData && !same) {
        delete _bufferData;
        _bufferData = NULL;
        _bufferMinibatchIdx = -1;
        _bufferPassIdx = -1;
    }
    _data = &data;
    for (int i = 0; i < _dataLayers.size(); i++) {
        _dataLayers[i]->copyData(*_data, false, passIdx);
    }
}
|
||||
|
||||
// Records the next (minibatch, pass) data and starts its asynchronous
// host->GPU copy so it overlaps with computation on the current data.
// bufferData may be NULL (end of epoch), in which case nothing is copied.
void ConvNet::setBuffer(CPUData* bufferData, int bufferMinibatchIdx, int bufferPassIdx) {
    _bufferData = bufferData;
    _bufferMinibatchIdx = bufferMinibatchIdx;
    _bufferPassIdx = bufferPassIdx;
    if (bufferData != NULL) {
        for (int i = 0; i < _dataLayers.size(); i++) {
            _dataLayers[i]->copyData(*_bufferData, true, bufferPassIdx);
        }
    }
}
|
||||
|
||||
// Current input data; must only be called after setData/setDataFromBuffer.
CPUData& ConvNet::getData() {
    assert(_data != NULL);
    return *_data;
}
|
||||
|
||||
// Backward pass: broadcasts the bprop-start message to all workers, waits for
// this pass's backward terminals, then resets state for the next pass.
void ConvNet::bprop(int passIdx, PASS_TYPE passType) {
    _totalPassesDone++;
    sendMessage(new BpropStartMessage(passType, passIdx), false);
    waitForTerminals(_numBwdTerminal[passIdx], BPROP_TERMINAL);
    reset(passIdx + 1);
}
|
||||
|
||||
void ConvNet::waitForTerminals(int numMsgs, MESSAGES msgType) {
|
||||
for (int rcvd = 0; rcvd < numMsgs; rcvd++) {
|
||||
Message* m = _msgQueue.dequeue();
|
||||
assert(m->getType() == msgType);
|
||||
delete m;
|
||||
}
|
||||
}
|
||||
|
||||
// Same as getCost() but adds results to given cost and returns it
|
||||
Cost& ConvNet::getCost(Cost& cost) {
    // Accumulate the freshly-computed total into the caller's accumulator,
    // then release the temporary.
    Cost* aggregated = &getCost();
    cost += *aggregated;
    delete aggregated;
    return cost;
}
|
||||
|
||||
// Sums the per-thread costs into a single heap-allocated Cost.
// Caller owns (and must delete) the returned object.
Cost& ConvNet::getCost() {
    Cost* total = new Cost();
    for (vector<ConvNetThread*>::const_iterator it = _convNetThreads.begin(); it != _convNetThreads.end(); ++it) {
        Cost* threadCost = &(*it)->getCost();
        *total += *threadCost;
        delete threadCost;
    }
    return *total;
}
|
||||
|
||||
double ConvNet::getCostValue() {
|
||||
Cost& cost = getCost();
|
||||
double val = cost.getValue();
|
||||
delete &cost;
|
||||
return val;
|
||||
}
|
||||
|
||||
// Queue on which worker threads post terminal notifications to the manager.
Queue<Message*>& ConvNet::getMessageQueue() {
    return _msgQueue;
}
|
||||
|
||||
// The list of CUDA device IDs this network runs on.
intv& ConvNet::getDeviceIDs() {
    return _deviceIDs;
}
|
||||
|
||||
// Synchronizer shared by the manager and all worker threads.
ThreadSynchronizer& ConvNet::getSync() {
    return *_sync;
}
|
||||
|
||||
// Sends a SYNC message to all workers, then joins the rendezvous with them.
void ConvNet::syncWithChildren() {
    sendMessage(SYNC, false);
    _sync->sync();
}
|
||||
|
||||
// Number of bprop passes executed so far (incremented in bprop()).
int ConvNet::getTotalPassesDone() {
    return _totalPassesDone;
}
|
||||
|
||||
// Minibatch size, as reported by the data provider.
int ConvNet::getMinibatchSize() {
    return _dp->getMinibatchSize();
}
|
||||
|
||||
// Maximum replica count over all layers (computed in connectReplicas()).
int ConvNet::getNumReplicasMax() {
    return _numReplicasMax;
}
|
||||
|
||||
// Minimum replica count over all layers (computed in connectReplicas()).
int ConvNet::getNumReplicasMin() {
    return _numReplicasMin;
}
|
||||
|
||||
// Number of fprop/bprop passes per minibatch: the ratio of the most- to
// least-replicated layer counts.
int ConvNet::getNumPasses() {
    return _numReplicasMax / _numReplicasMin;
}
|
||||
|
||||
// Records the fraction of training completed (used by schedule-dependent layers).
void ConvNet::setTrainingProgress(double progress) {
    _trainingProgress = progress;
}
|
||||
|
||||
// Fraction of training completed, as last set by setTrainingProgress().
double ConvNet::getTrainingProgress() const {
    return _trainingProgress;
}
|
||||
|
||||
// Whether the network was configured to trade speed for lower memory use.
bool ConvNet::isConserveMemory() {
    return _conserveMem;
}
|
||||
|
||||
/*
|
||||
* Gradient checking stuff
|
||||
*/
|
||||
// Runs numerical gradient checking: first computes the baseline cost over all
// passes, then asks each (GPU-resident) layer's replica 0 to verify its
// analytic gradients against finite differences. Prints a summary at the end.
void ConvNet::checkGradients() {
    _numFailures = 0;
    _numTests = 0;
    _baseErr = 0;
    for (int p = 0; p < getNumPasses(); ++p) {
        fprop(0, p, PASS_GC);
        _baseErr += getCostValue();
        bprop(p, PASS_GC);
    }
    // We call grad check only on the first replica,
    // but because weights are aware of their fellow replicas,
    // we can simultaneously perturb the weights of all
    // replicas.
    for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
        map<int, Layer*>& layers = it->second;
        if (layers[0]->getDeviceID() >= 0 /*&& (layers[0]->getName() == "fc10")*/) { // If layer on GPU (data layers aren't)
            layers[0]->checkGradient();
        }
    }

    cout << "------------------------" << endl;
    if (_numFailures > 0) {
        cout << _numFailures << "/" << _numTests << " TESTS FAILED" << endl;
    } else {
        cout << "ALL " << _numTests << " TESTS PASSED" << endl;
    }
}
|
||||
|
||||
// Copies to all replicas
|
||||
// Copies the (perturbed) host weight matrix to every replica of the given
// weights, restoring the caller's active device afterwards.
void ConvNet::checkGradient_copyWeightsToGPU(Matrix& weightsCPU, Weights& weights) {
    int d = NVMatrix::getDeviceID();
    for (map<int, Weights*>::const_iterator it = weights.getReplicas().begin(); it != weights.getReplicas().end(); ++it) {
        NVMatrix::setDeviceID(it->second->getDeviceID());
        it->second->getW().copyFromHost(weightsCPU);
    }
    NVMatrix::setDeviceID(d);
}
|
||||
|
||||
/*
|
||||
* name: weight matrix name
|
||||
* eps: finite difference step
|
||||
*/
|
||||
// Finite-difference gradient check for one weight matrix. Perturbs each weight
// by eps, re-runs fprop over all passes to get the numerical gradient, then
// compares it (relative L2 error) against the analytic gradient reduced over
// all replicas. Returns true on FAILURE (relErr >= GC_REL_ERR_THRESH).
bool ConvNet::checkGradient(const std::string& name, float eps, Weights& weights) {
    Matrix numGrad(weights.getNumRows(), weights.getNumCols());
    Matrix diff(numGrad);
    numGrad.apply(Matrix::ZERO);
    Matrix weightsCPU;

    weights.getW().copyToHost(weightsCPU, true);

    for(int i = 0; i < weights.getNumRows(); i++) {
        for (int j = 0; j < weights.getNumCols(); j++) {
            float v = weightsCPU(i,j);
            weightsCPU(i,j) += eps;

            // Push the perturbed weights to all replicas.
            checkGradient_copyWeightsToGPU(weightsCPU, weights);

            weightsCPU(i,j) = v;
            double err = 0;
            for (int p = 0; p < getNumPasses(); ++p) {
//                printf("trying fprop %d\n", p);
                fprop(0, p, PASS_GC);
//                printf("      success\n");
                err += getCostValue();
            }
            // Forward difference against the baseline cost, per-case.
            numGrad(i,j) = (err - _baseErr) / (_data->getNumCases() * eps);
            if (isnan((double)numGrad(i,j)) || isinf((double)numGrad(i,j))) {
                cout << "Numerical computation produced nan or inf when checking '" << name << "': " << numGrad(i,j) << endl;
                cout << "Consider reducing the sizes of the weights or finite difference steps." << endl;
                cout << "Exiting." << endl;
                exit(1);
            }
            // Restore the unperturbed weights on all replicas.
            checkGradient_copyWeightsToGPU(weightsCPU, weights);
        }
    }
    Matrix gradCPU;
    NVMatrix::setDeviceID(weights.getDeviceID());
    // Reduce the analytic gradient over all replicas before comparing.
    map<int,NVMatrix*> mats;
    for (map<int, Weights*>::const_iterator it = weights.getReplicas().begin(); it != weights.getReplicas().end(); ++it) {
        mats[it->first] = &it->second->getGrad();
    }
    weights.getReducer().reduce(mats, 1, false);

    weights.getGrad().copyToHost(gradCPU, true);
    gradCPU.scale(-1.0 / _data->getNumCases());
    float analNorm = gradCPU.norm();
    float numNorm = numGrad.norm();
    numGrad.subtract(gradCPU, diff);
    float relErr = diff.norm() / analNorm;
    bool fail = relErr >= GC_REL_ERR_THRESH;
    if (fail || !GC_SUPPRESS_PASSES) {
        cout << "========================" << endl;
        printf("(%s) %s GRADIENT CHECK\n", fail ? "****FAIL****" : "PASS", name.c_str());
        cout << "========================" << endl;
        cout << "Analytic:" << endl;
        gradCPU.print(0, 6, 0, 4);
        cout << "Numeric:" << endl;
        numGrad.print(0, 6, 0, 4);
        printf("Analytic norm: %e\n", analNorm);
        printf("Numeric norm:  %e\n", numNorm);
        printf("Relative error: %e\n", relErr);
    }
    _numTests++;
    _numFailures += fail;
    return fail;
}
|
||||
|
||||
/*
|
||||
* =======================================================================================================
|
||||
* ConvNetThread
|
||||
* =======================================================================================================
|
||||
*/
|
||||
// Worker thread for one GPU: instantiates every non-data layer whose "gpu"
// list contains this thread's device index. The position within that list
// becomes the layer's replica ID.
ConvNetThread::ConvNetThread(PyObjectV* layerList, int deviceID, int deviceIdx, ConvNet* convNet)
    : Thread(true, getDeviceCPUs(deviceID)), _deviceID(deviceID), _convNet(convNet) {
    try {
        int numLayers = layerList->size();

        for (int i = 0; i < numLayers; i++) {
            PyObject* paramsDict = layerList->at(i);
            std::string layerType = pyDictGetString(paramsDict, "type");
            if (layerType != "data") {
                intv& gpus = *pyDictGetIntV(paramsDict, "gpu");
                int rid = indexOf(gpus, deviceIdx);
                if (rid >= 0) {
                    initLayer(paramsDict, rid);
                }
                delete &gpus;
            }
        }
    } catch (std::string& s) {
        cout << "Error creating ConvNet: " << s << endl;
        exit(1);
    }
}
|
||||
|
||||
// Tears down this thread's CUDA state (cuBLAS, RNG) on its own device and
// frees all layers it owns.
ConvNetThread::~ConvNetThread() {
    NVMatrix::setDeviceID(_deviceID);
    NVMatrix::destroyCublas();
    NVMatrix::destroyRandom();
    for (NameLayerMap::const_iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
        delete it->second;
    }
    _nameLayerMap.clear();
}
|
||||
|
||||
// Starts the wall-clock timer after draining the CUDA stream, so timings
// measure only subsequently launched work.
void ConvNetThread::startTimer() {
    NVMatrix::syncStream();
    _timer.start();
}
|
||||
|
||||
// Stops the timer after draining the CUDA stream; returns elapsed time.
double ConvNetThread::stopTimer() {
    NVMatrix::syncStream();
    return _timer.stop();
}
|
||||
|
||||
// Layer factory: constructs the layer object matching the "type" string in
// paramsDict and registers it under its "name". Cost layers (type "cost.*")
// are additionally tracked in _costs. Throws a std::string for unknown types.
void ConvNetThread::initLayer(PyObject* paramsDict, int replicaID) {
    std::string type = pyDictGetString(paramsDict, "type");
    std::string name = pyDictGetString(paramsDict, "name");
    if (type == "fc") {
        _nameLayerMap[name] = new FCLayer(this, paramsDict, replicaID, false);
    } else if (type == "sfc") {
        _nameLayerMap[name] = new SplitFCLayer(this, paramsDict, replicaID, false);
    } else if (type == "conv") {
        _nameLayerMap[name] = new ConvLayer(this, paramsDict, replicaID);
    } else if (type == "local") {
        _nameLayerMap[name] = new LocalUnsharedLayer(this, paramsDict, replicaID);
    } else if (type == "pool") {
        _nameLayerMap[name] = &PoolLayer::make(this, paramsDict, replicaID);
    } else if (type == "cmpool") {
        _nameLayerMap[name] = &CrossMapPoolLayer::make(this, paramsDict, replicaID);
    } else if (type == "rnorm") {
        _nameLayerMap[name] = new ResponseNormLayer(this, paramsDict, replicaID);
    } else if (type == "cmrnorm") {
        _nameLayerMap[name] = new CrossMapResponseNormLayer(this, paramsDict, replicaID);
    } else if (type == "cnorm") {
        _nameLayerMap[name] = new ContrastNormLayer(this, paramsDict, replicaID);
    } else if (type == "softmax") {
        _nameLayerMap[name] = new SoftmaxLayer(this, paramsDict, replicaID);
    } else if (type == "eltsum") {
        _nameLayerMap[name] = new EltwiseSumLayer(this, paramsDict, replicaID);
    } else if (type == "eltmax") {
        _nameLayerMap[name] = new EltwiseMaxLayer(this, paramsDict, replicaID);
    } else if (type == "neuron") {
        _nameLayerMap[name] = new NeuronLayer(this, paramsDict, replicaID);
    } else if (type == "nailbed") {
        _nameLayerMap[name] = new NailbedLayer(this, paramsDict, replicaID);
    } else if (type == "blur") {
        _nameLayerMap[name] = new GaussianBlurLayer(this, paramsDict, replicaID);
    } else if (type == "href") {
        _nameLayerMap[name] = new HorizontalReflectionLayer(this, paramsDict, replicaID);
    } else if (type == "resize") {
        _nameLayerMap[name] = new ResizeLayer(this, paramsDict, replicaID);
    } else if (type == "rgb2yuv") {
        _nameLayerMap[name] = new RGBToYUVLayer(this, paramsDict, replicaID);
    } else if (type == "rgb2lab") {
        _nameLayerMap[name] = new RGBToLABLayer(this, paramsDict, replicaID);
    } else if (type == "rscale") {
        _nameLayerMap[name] = new RandomScaleLayer(this, paramsDict, replicaID);
    } else if (type == "crop") {
        _nameLayerMap[name] = new CropLayer(this, paramsDict, replicaID);
    } else if (type == "concat") {
        _nameLayerMap[name] = new ConcatenationLayer(this, paramsDict, replicaID);
    } else if (type == "pass") {
        _nameLayerMap[name] = new PassThroughLayer(this, paramsDict, replicaID);
    } else if (type == "dropout") {
        _nameLayerMap[name] = new DropoutLayer(this, paramsDict, replicaID);
    } else if (type == "dropout2") {
        _nameLayerMap[name] = new Dropout2Layer(this, paramsDict, replicaID);
    } else if (strncmp(type.c_str(), "cost.", 5) == 0) {
        CostLayer *c = &CostLayer::make(this, paramsDict, type, replicaID);
        _nameLayerMap[name] = c;
        _costs.push_back(c);
    } else {
        throw std::string("Unknown layer type ") + type;
    }
}
|
||||
|
||||
/*
|
||||
* This executes in a new CPU thread so it's OK to initialize CUDA stuff here.
|
||||
*/
|
||||
// Per-thread CUDA setup: binds this thread to its device, enables peer access
// to every other device in the network that supports it, and initializes
// cuBLAS and the random generator.
void ConvNetThread::initCuda() {
    NVMatrix::setDeviceID(_deviceID);
    checkCudaErrors(cudaDeviceSetCacheConfig(cudaFuncCachePreferShared));
    for (int i = 0; i < _convNet->getDeviceIDs().size(); i++) {
        int d = _convNet->getDeviceIDs()[i];
        if (d != _deviceID) {
            if (NVMatrix::canAccessPeer(_deviceID, d)) {
                printf("Enabling peer access GPU %d --> GPU %d\n", NVMatrix::getDeviceID(), d);
                checkCudaErrors(cudaDeviceEnablePeerAccess(d, 0));
            } else {
                printf("No peer access GPU %d --> GPU  %d\n", _deviceID, d);
            }
        }
    }
//    NVMatrix::syncStream();
    NVMatrix::initCublas();
    NVMatrix::initRandom(/*7*/);
    srand(time(0));
}
|
||||
|
||||
// Worker thread main loop: initializes CUDA, then dispatches messages from the
// manager (fprop/bprop steps, sync, CPU/GPU copies, resets, weight updates)
// until an EXIT_CONVNET message arrives. Each message is freed after handling.
void* ConvNetThread::run() {
    initCuda();
    bool exit = false;
    while (!exit) {
        Message* m = _msgQueue.dequeue();
        if (m->getType() == FPROP_READY) {
            FpropMessage* msg = static_cast<FpropMessage*>(m);
            msg->getToLayer().fprop(msg->getPassType(), msg->getPassIdx());
        } else if (m->getType() == BPROP_READY) {
            BpropMessage* msg = static_cast<BpropMessage*>(m);
            msg->getToLayer().incRcvdBInputMsgs();
            msg->getToLayer().bprop(msg->getPassType(), msg->getPassIdx());
        } else if (m->getType() == BPROP_START) {
            // Kick off the backward pass from this thread's cost layers.
            BpropStartMessage* msg = static_cast<BpropStartMessage*>(m);
            for (int i = 0; i < _costs.size(); i++) {
                dynamic_cast<Layer*>(_costs[i])->bprop(msg->getPassType(), msg->getPassIdx());
            }
        } else if (m->getType() == SYNC) {
            NVMatrix::syncStream();
            _convNet->getSync().sync();
        } else if (m->getType() == COPY_TO_CPU) {
            for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
                it->second->copyToCPU();
            }
        } else if (m->getType() == COPY_TO_GPU) {
            for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
                it->second->copyToGPU();
            }
        } else if (m->getType() == RESET) {
            for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
                it->second->reset();
            }
        } else if (m->getType() == RESET_PASS_IDX) {
            for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
                it->second->resetPassIdx();
            }
        } else if (m->getType() == UPDATE_WEIGHTS) {
            for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
                it->second->updateWeights();
            }
        } else if (m->getType() == CONSTRAIN_WEIGHTS) {
            for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
                it->second->constrainWeights();
            }
        } else if (m->getType() == EXIT_CONVNET) {
            exit = true;
        }
        delete m;
    }
    return NULL;
}
|
||||
|
||||
// Aggregate cost over this thread's cost layers. Caller owns the result.
Cost& ConvNetThread::getCost() {
    // In a single ConvNetThread, all costs are guaranteed to be different
    // (i.e. not replicas of one another)
    return *new Cost(_costs);
}
|
||||
|
||||
// Looks up one of this thread's layers by name.
// NOTE(review): operator[] inserts a NULL entry if the name is absent.
Layer& ConvNetThread::getLayer(std::string& name) {
    return *_nameLayerMap[name];
}
|
||||
|
||||
// CUDA device this thread is bound to.
int ConvNetThread::getDeviceID() {
    return _deviceID;
}
|
||||
|
||||
// Inbound message queue for this worker thread.
Queue<Message*>& ConvNetThread::getMessageQueue() {
    return _msgQueue;
}
|
||||
|
||||
// Cost layers owned by this thread (populated by initLayer for "cost.*" types).
vector<CostLayer*>& ConvNetThread::getCostLayers() {
    return _costs;
}
|
||||
|
||||
// name -> Layer map of all layers owned by this thread.
NameLayerMap& ConvNetThread::getLayerMap() {
    return _nameLayerMap;
}
|
||||
|
||||
// The parent network this thread belongs to.
ConvNet& ConvNetThread::getConvNet() {
    return *_convNet;
}
|
378
caffe2/contrib/cuda-convnet2/cudaconvnet/src/copypipeline.cu
Normal file
378
caffe2/contrib/cuda-convnet2/cudaconvnet/src/copypipeline.cu
Normal file
@ -0,0 +1,378 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "../include/copypipeline.cuh"
|
||||
//#include "gpu_util.cuh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/* =========================
|
||||
* ICopySegment
|
||||
* =========================
|
||||
*/
|
||||
// One node of a broadcast pipeline. By default the segment executes on its own
// device; DEVICE_HOST segments later adopt their predecessor's device (addPrev).
ICopySegment::ICopySegment(IBroadcastNetwork& parent, int deviceID, Queue<int>* finishQueue)
    : _parent(&parent), _prev(NULL), _stream(NULL), _deviceID(deviceID), _finishQueue(finishQueue), Thread(true, getDeviceCPUs(parent.getSourceDeviceID())) {
    _execDeviceID = _deviceID;
}
|
||||
|
||||
// Destroys the CUDA stream created in run(), if any.
ICopySegment::~ICopySegment() {
    if (_stream != NULL) {
        checkCudaErrors(cudaStreamDestroy(_stream));
    }
}
|
||||
|
||||
// Segment thread main loop: binds to its execution device, creates a private
// non-blocking stream, and processes copy messages until EXIT. When
// processMessage() reports this segment finished a terminal copy, a token is
// pushed onto the parent's finish queue.
void* ICopySegment::run() {
    assert(_execDeviceID != DEVICE_HOST);
    NVMatrix::setDeviceID(_execDeviceID);
    checkCudaErrors(cudaStreamCreateWithFlags(&_stream, cudaStreamNonBlocking));
    bool exit = false;
    while (!exit) {
        CopyMessage& msg = *_queue.dequeue();
        if (msg.getType() == CopyMessage::EXIT) {
            exit = true;
        } else {
            bool term = processMessage(msg);
            if (term) {
                assert(_finishQueue != NULL);
                _finishQueue->enqueue(1);
            }
        }
        delete &msg;
    }
    return NULL;
}
|
||||
|
||||
// Returns a heap-allocated view of chunk chunkIdx of mat, treating mat as one
// flat row. The last chunk is clamped to the matrix size. Caller deletes it.
NVMatrix& ICopySegment::getChunk(NVMatrix& mat, int chunkSize, int chunkIdx) {
    NVMatrix& line = mat.reshaped(1, mat.getNumElements());
    int start = chunkIdx * chunkSize;
    int end = min((chunkIdx+1) * chunkSize, mat.getNumElements());
    NVMatrix& chunk = line.sliceCols(start, end);
    delete &line;
    return chunk;
}
|
||||
|
||||
// The matrix this segment reads/writes: its own host-side staging matrix when
// it represents the host, otherwise the device matrix from the message.
inline NVMatrix& ICopySegment::getMatrix(CopyMessage& msg) {
    if (getDeviceID() == DEVICE_HOST) {
        return _hmat;
    }
    return msg.getMatrix(getDeviceID());
}
|
||||
|
||||
// Inbound message queue for this segment.
Queue<CopyMessage*>& ICopySegment::getQueue() {
    return _queue;
}
|
||||
|
||||
// Logical device of this segment (may be DEVICE_HOST).
inline int ICopySegment::getDeviceID() {
    return _deviceID;
}
|
||||
|
||||
// Links this segment to its upstream neighbor. A host segment has no GPU of
// its own, so it executes on its predecessor's device.
void ICopySegment::addPrev(ICopySegment& c) {
    _prev = &c;
    if (_deviceID == DEVICE_HOST) {
        _execDeviceID = c.getDeviceID();
    }
}
|
||||
|
||||
// Adds a downstream neighbor and back-links it to this segment.
void ICopySegment::addNext(CopyPeer& c) {
    _next.push_back(&c);
    c.addPrev(*this);
}
|
||||
|
||||
// A segment with no downstream neighbors is a terminal of the pipeline.
bool ICopySegment::isTerminal() const {
    return _next.size() == 0;
}
|
||||
|
||||
/* =========================
|
||||
* CopySource
|
||||
* =========================
|
||||
*/
|
||||
// The source never signals completion itself, hence a NULL finish queue.
CopySource::CopySource(IBroadcastNetwork& parent, int deviceID) : ICopySegment(parent, deviceID, NULL) {
}
|
||||
|
||||
// Splits the source matrix into chunks and fans the chunk messages out to all
// downstream peers. Note the loop runs to c == numChunks inclusive: the extra
// message with chunkIdx == numChunks carries no data and serves as the
// end-of-copy marker that terminal peers translate into a finish signal.
bool CopySource::processMessage(CopyMessage& msg) {
    assert(msg.getType() == CopyMessage::COPY_START);
    int numChunks = min(getMatrix(msg).getNumElements(), max(COPY_MIN_CHUNKS, min(COPY_MAX_CHUNKS, DIVUP(getMatrix(msg).getNumElements(), COPY_MIN_CHUNK_SIZE))));
    int chunkSize = DIVUP(getMatrix(msg).getNumElements(), numChunks);
//    printf("num chunks: %d\n", numChunks);
    for (int c = 0; c <= numChunks; ++c) {
        for (vector<CopyPeer*>::const_iterator it = _next.begin(); it != _next.end(); ++it) {
            (*it)->getQueue().enqueue(new CopyChunkMessage(c, chunkSize, numChunks, msg.getScaleSource(), msg.getScaleTargets(), msg.getMatrices()));
        }
    }
    return false;
}
|
||||
|
||||
// CopySource is the pipeline's source node.
inline bool CopySource::isSource() const {
    return true;
}
|
||||
|
||||
/* =========================
|
||||
* CopyPeer
|
||||
* =========================
|
||||
*/
|
||||
// Intermediate/terminal pipeline node; signals completion via finishQueue.
CopyPeer::CopyPeer(IBroadcastNetwork& parent, int deviceID, Queue<int>* finishQueue) : ICopySegment(parent, deviceID, finishQueue) {
}
|
||||
|
||||
// Copies one chunk from the upstream segment's matrix into this segment's
// matrix on this segment's stream, then forwards the chunk message downstream.
// Real chunks have chunkIdx < numChunks; the sentinel chunkIdx == numChunks
// message only propagates, and at a terminal it returns true to signal that
// the whole copy has finished.
bool CopyPeer::processMessage(CopyMessage& msg) {
    assert(msg.getType() == CopyMessage::COPY_CHUNK);
    CopyChunkMessage& cmsg = *static_cast<CopyChunkMessage*>(&msg);
    if (cmsg.getChunkIdx() < cmsg.getNumChunks()) {
        // Don't resize a terminal that is accumulating into existing contents.
        if (!isTerminal() || (isTerminal() && msg.getScaleTargets() == 0)) {
            getMatrix(msg).resize(_prev->getMatrix(msg));
        }
//        getMatrix(msg).printShape("getMatrix(msg)");
//        _prev->getMatrix(msg).printShape("_prev->getMatrix(msg)");
        assert(getMatrix(msg).isSameDims(_prev->getMatrix(msg)));
        // Only terminals blend with their existing contents; only the hop out
        // of the source applies the source scale.
        const float scaleSelf = isTerminal() ? msg.getScaleTargets() : 0;
        const float scalePrev = _prev->isSource() ? msg.getScaleSource() : 1;
        NVMatrix& prevChunk = getChunk(_prev->getMatrix(msg), cmsg.getChunkSize(), cmsg.getChunkIdx());
        NVMatrix& myChunk = getChunk(getMatrix(msg), cmsg.getChunkSize(), cmsg.getChunkIdx());
        prevChunk.add(myChunk, scalePrev, scaleSelf, myChunk, _stream);
        NVMatrix::syncStream(_stream);
        delete &prevChunk;
        delete &myChunk;
    }
    for (vector<CopyPeer*>::const_iterator it = _next.begin(); it != _next.end(); ++it) {
        (*it)->getQueue().enqueue(new CopyChunkMessage(cmsg));
    }
    return cmsg.getChunkIdx() >= cmsg.getNumChunks() && isTerminal();
}
|
||||
|
||||
// Peers are never the pipeline source.
inline bool CopyPeer::isSource() const {
    return false;
}
|
||||
|
||||
/* =========================
|
||||
* IBroadcastNetwork
|
||||
* =========================
|
||||
*/
|
||||
// Factory: picks a broadcast topology based on the device set — a fixed graph
// for 8 GPUs, a no-op for 1, direct peer copy for 2 peering GPUs, and the
// src -> host -> all fallback otherwise. Returns a fully constructed network.
IBroadcastNetwork& IBroadcastNetwork::make(set<int> devices, int srcDevice) {
    if (devices.size() == 8) {
        return (new EightGPUBroadcaster1(devices, srcDevice))->construct();
    } else if (devices.size() == 1) {
        return (new NullBroadcaster(devices, srcDevice))->construct();
    } else if (devices.size() == 2 && NVMatrix::canAccessPeer(*devices.begin(), *(++devices.begin()))) {
        return (new TwoPeeringGPUsBroadcaster(devices, srcDevice))->construct();
    }
    return (new NaiveBroadcaster(devices, srcDevice))->construct();
}
|
||||
|
||||
// numTerminal = number of terminal segments expected to report per broadcast.
IBroadcastNetwork::IBroadcastNetwork(set<int>& devices, int srcDeviceID, int numTerminal)
    : _devices(devices), _srcDeviceID(srcDeviceID), _numTerminal(numTerminal), _constructed(false), _src(NULL) {
}
|
||||
|
||||
// Sends EXIT to every segment (peers and source), joins its thread, and frees it.
IBroadcastNetwork::~IBroadcastNetwork() {
    vector<ICopySegment*> v;
    v.insert(v.end(), _peers.begin(), _peers.end());
    v.insert(v.end(), _src);
    for (vector<ICopySegment*>::const_iterator it = v.begin(); it != v.end(); ++it) {
        (*it)->getQueue().enqueue(new CopyMessage(CopyMessage::EXIT));
        (*it)->join();
        delete *it;
    }
}
|
||||
|
||||
// Builds the pipeline: creates the source and peer segments, lets the concrete
// subclass wire them up (makeConnections), then starts all segment threads.
// Must be called exactly once before broadcasting.
IBroadcastNetwork& IBroadcastNetwork::construct() {
    assert(!_constructed);
    pair<vector<int>,vector<int> > gpus = makeGPULists();
    _src = new CopySource(*this, _srcDeviceID);
    makePeers(gpus);
    makeConnections();
    _src->start();
    for (vector<CopyPeer*>::const_iterator it = _peers.begin(); it != _peers.end(); ++it) {
        (*it)->start();
    }
    _constructed = true;
    return *this;
}
|
||||
|
||||
// Partitions the non-source devices into those that can peer with the source
// ("same") and those that cannot ("other"). Each device is inserted at a
// random position so the resulting orders are shuffled.
pair<vector<int>,vector<int> > IBroadcastNetwork::makeGPULists() {
    vector<int> same, other;
    for (set<int>::const_iterator it = _devices.begin(); it != _devices.end(); ++it) {
        if (*it != _srcDeviceID) {
            if (NVMatrix::canAccessPeer(_srcDeviceID, *it)) {
                same.insert(same.begin() + rand() % (1 + same.size()), *it);
            } else {
                other.insert(other.begin() + rand() % (1 + other.size()), *it);
            }
        }
    }
    return pair<vector<int>,vector<int> >(same, other);
}
|
||||
|
||||
// Plain broadcast: overwrite every target with the source matrix.
void IBroadcastNetwork::broadcast(std::map<int, NVMatrix*>& mats) {
    _broadcast(mats, 1, 0);
}
|
||||
|
||||
// Core broadcast: target[d] = scaleTargets * target[d] + scaleSource * src.
// An empty source matrix short-circuits to resizing all targets (no copy);
// otherwise the copy is started at the source segment and this call blocks
// until all _numTerminal terminal segments report completion.
void IBroadcastNetwork::_broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets) {
    assert(_constructed);
    assert(_finishQueue.getNumElements() == 0);
    assert(mats.size() == _devices.size());
    assert(mats.size() > 1);
    if (mats[_srcDeviceID]->getNumElements() == 0) {
        for (map<int,NVMatrix*>::const_iterator it = mats.begin(); it != mats.end(); ++it) {
            it->second->resize(*mats[_srcDeviceID]);
        }
    } else {
        _src->getQueue().enqueue(new CopyStartMessage(scaleSource, scaleTargets, mats));
        for (int i = 0; i < _numTerminal; ++i) {
            _finishQueue.dequeue();
        }
    }
    assert(_finishQueue.getNumElements() == 0);
}
|
||||
|
||||
// Device that owns the matrix being broadcast.
int IBroadcastNetwork::getSourceDeviceID() const {
    return _srcDeviceID;
}
|
||||
|
||||
// Creates one CopyPeer per non-source device (peering devices first, then the
// rest), plus a final host peer that subclasses may use as a relay hub.
void IBroadcastNetwork::makePeers(pair<vector<int>,vector<int> >& gpus) {
    vector<int>& same = gpus.first, &other = gpus.second;
    for (int i = 0; i < same.size(); ++i) {
        _peers.push_back(new CopyPeer(*this, same[i], &_finishQueue));
    }
    for (int i = 0; i < other.size(); ++i) {
        _peers.push_back(new CopyPeer(*this, other[i], &_finishQueue));
    }
    _peers.push_back(new CopyPeer(*this, DEVICE_HOST, &_finishQueue)); // peers[7]
}
|
||||
|
||||
/* =========================
|
||||
* ISafeBroadcastNetwork
|
||||
* =========================
|
||||
*/
|
||||
// Factory restricted to topologies that support scaled ("safe") broadcasts:
// no-op for 1 device, peer copy for 2 peering GPUs, naive host relay otherwise.
ISafeBroadcastNetwork& ISafeBroadcastNetwork::make(set<int> devices, int srcDevice) {
    if (devices.size() == 1) {
        return (new NullBroadcaster(devices, srcDevice))->construct();
    } else if (devices.size() == 2 && NVMatrix::canAccessPeer(*devices.begin(), *(++devices.begin()))) {
        return (new TwoPeeringGPUsBroadcaster(devices, srcDevice))->construct();
    }
    return (new NaiveBroadcaster(devices, srcDevice))->construct();
}
|
||||
|
||||
// Pass-through constructor; "safe" networks additionally expose scaled broadcast.
ISafeBroadcastNetwork::ISafeBroadcastNetwork(std::set<int>& devices, int srcDeviceID, int numTerminal) : IBroadcastNetwork(devices, srcDeviceID, numTerminal) {
}
|
||||
|
||||
// Public entry point for safe broadcasters: delegates directly to the
// protected _broadcast implementation.
void ISafeBroadcastNetwork::broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets) {
    _broadcast(mats, scaleSource, scaleTargets);
}
|
||||
|
||||
// Runs the base-class construction, then returns *this so that callers get
// the derived (safe) type back for chaining.
ISafeBroadcastNetwork& ISafeBroadcastNetwork::construct() {
    IBroadcastNetwork::construct();
    return *this;
}
|
||||
|
||||
/* =========================
|
||||
* NullBroadcaster
|
||||
* =========================
|
||||
*/
|
||||
// Broadcaster for a single device: zero terminal nodes, nothing to copy.
NullBroadcaster::NullBroadcaster(std::set<int>& devices, int srcDeviceID)
    : ISafeBroadcastNetwork(devices, srcDeviceID, 0) {
}
|
||||
|
||||
// No copy graph is needed when there is only one device.
void NullBroadcaster::makeConnections() {
}
|
||||
|
||||
// Skips IBroadcastNetwork::construct() entirely — there are no peers to
// spawn or connect — and just marks the object ready.
NullBroadcaster& NullBroadcaster::construct() {
    _constructed = true;
    return *this;
}
|
||||
|
||||
// Broadcasting to a single device is a no-op; the source already holds the data.
void NullBroadcaster::broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets) {
}
|
||||
|
||||
// Unscaled overload: also a no-op for a single device.
void NullBroadcaster::broadcast(std::map<int, NVMatrix*>& mats) {
}
|
||||
|
||||
/* =========================
|
||||
* NaiveBroadcaster
|
||||
* =========================
|
||||
*
|
||||
* This one does src -> host -> all
|
||||
*/
|
||||
// Broadcasts via host memory (src -> host -> each target); every device
// except the source is a terminal node.
NaiveBroadcaster::NaiveBroadcaster(std::set<int>& devices, int srcDeviceID)
    : ISafeBroadcastNetwork(devices, srcDeviceID, devices.size() - 1) {
}
|
||||
|
||||
// Wires the copy graph: the source feeds the host peer (always the last
// element of _peers), and the host peer fans out to every device peer
// except the one on the source device itself.
void NaiveBroadcaster::makeConnections() {
    _src->addNext(*_peers.back()); // src -> host
    for (size_t p = 0; p + 1 < _peers.size(); ++p) {
        if (_peers[p]->getDeviceID() != _src->getDeviceID()) {
            _peers.back()->addNext(*_peers[p]); // host -> device peer
        }
    }
}
|
||||
|
||||
/* =========================
|
||||
* EightGPUBroadcaster1
|
||||
* =========================
|
||||
*
|
||||
* This one does a fancy graph
|
||||
*/
|
||||
// Hand-built broadcast tree for an 8-GPU machine; the tree has 4 terminal
// (leaf) nodes — see makeConnections() for the topology.
EightGPUBroadcaster1::EightGPUBroadcaster1(set<int>& devices, int srcDeviceID)
    : IBroadcastNetwork(devices, srcDeviceID, 4) {
}
|
||||
|
||||
// Fixed fan-out tree for 8 GPUs:
//   src -> host(7) -> {0, 1, 3, 4};  then 1 -> 2, 3 -> 5, 4 -> 6.
// Peers 0, 2, 5 and 6 have no children, matching the 4 terminal nodes
// declared in the constructor.
void EightGPUBroadcaster1::makeConnections() {
    _src->addNext(*_peers[7]);
    const int hostChildren[] = {0, 1, 3, 4};
    for (int i = 0; i < 4; ++i) {
        _peers[7]->addNext(*_peers[hostChildren[i]]);
    }
    _peers[1]->addNext(*_peers[2]);
    _peers[3]->addNext(*_peers[5]);
    _peers[4]->addNext(*_peers[6]);
}
|
||||
|
||||
/* =========================
|
||||
* TwoPeeringGPUsBroadcaster
|
||||
* =========================
|
||||
*/
|
||||
// Direct peer-to-peer broadcast between exactly two GPUs. The target is
// whichever of the two devices is not the source. Zero terminal nodes:
// the copy is issued synchronously in _broadcast().
TwoPeeringGPUsBroadcaster::TwoPeeringGPUsBroadcaster(std::set<int>& devices, int srcDeviceID)
    : ISafeBroadcastNetwork(devices, srcDeviceID, 0) {
    std::set<int>::iterator it = devices.begin();
    if (*it == srcDeviceID) {
        ++it;
    }
    _tgtDeviceID = *it;
}
|
||||
|
||||
// Releases the target-device stream, but only if construct() ever created it.
TwoPeeringGPUsBroadcaster::~TwoPeeringGPUsBroadcaster() {
    if (!_constructed) {
        return;
    }
    checkCudaErrors(cudaStreamDestroy(_tgtStream));
}
|
||||
|
||||
// No copy graph: the transfer is issued directly in _broadcast().
void TwoPeeringGPUsBroadcaster::makeConnections() {
}
|
||||
|
||||
void TwoPeeringGPUsBroadcaster::resetDeviceID(int d) {
|
||||
if (d >= 0) {
|
||||
NVMatrix::setDeviceID(d);
|
||||
}
|
||||
}
|
||||
|
||||
// One-time setup: creates a non-blocking stream on the target device for
// the peer copy, restoring whatever device the caller had active.
ISafeBroadcastNetwork& TwoPeeringGPUsBroadcaster::construct() {
    assert(!_constructed);
    const int callerDevice = NVMatrix::getDeviceID();
    NVMatrix::setDeviceID(_tgtDeviceID);
    checkCudaErrors(cudaStreamCreateWithFlags(&_tgtStream, cudaStreamNonBlocking));
    resetDeviceID(callerDevice);
    _constructed = true;
    return *this;
}
|
||||
|
||||
// Accumulates the source matrix into the target matrix (scaled by
// scaleSource / scaleTargets) on the target device's private stream, then
// waits for the stream, so the call is synchronous. The caller's active
// device is restored on exit.
void TwoPeeringGPUsBroadcaster::_broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets) {
    const int callerDevice = NVMatrix::getDeviceID();
    NVMatrix::setDeviceID(_tgtDeviceID);
    NVMatrix& tgt = *mats[_tgtDeviceID];
    tgt.add(*mats[_srcDeviceID], scaleTargets, scaleSource, tgt, _tgtStream);
    NVMatrix::syncStream(_tgtStream);
    resetDeviceID(callerDevice);
}
|
||||
|
113
caffe2/contrib/cuda-convnet2/cudaconvnet/src/cost.cu
Normal file
113
caffe2/contrib/cuda-convnet2/cudaconvnet/src/cost.cu
Normal file
@ -0,0 +1,113 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
#include "../include/cost.cuh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/*
|
||||
* =====================
|
||||
* Cost
|
||||
* =====================
|
||||
*/
|
||||
|
||||
// Creates an empty cost accumulator.
Cost::Cost() {
}
|
||||
|
||||
// Snapshots each cost layer's cost vector (by pointer, not copied),
// coefficient, and case count, keyed by the layer's name.
Cost::Cost(vector<CostLayer*>& costs) {
    for (vector<CostLayer*>::iterator it = costs.begin(); it != costs.end(); ++it) {
        CostLayer& layer = **it;
        _costMap[layer.getName()] = &layer.getCost();
        _costCoeffMap[layer.getName()] = layer.getCoeff();
        _numCases[layer.getName()] = layer.getNumCases();
    }
}
|
||||
|
||||
int Cost::getNumCases() {
|
||||
return _numCases.size() == 0 ? 0 : _numCases.begin()->second;
|
||||
}
|
||||
|
||||
// Per-cost-name case counts.
map<std::string,int>& Cost::getNumCasesMap() {
    return _numCases;
}
|
||||
|
||||
// Cost vector lookup by name. NOTE: an unknown name default-inserts a NULL
// pointer into the map and dereferences it, so callers must pass a name
// that is already present.
doublev& Cost::operator [](const std::string s) {
    return *_costMap[s];
}
|
||||
|
||||
// Name -> cost-vector map.
CostMap& Cost::getCostMap() {
    return _costMap;
}
|
||||
|
||||
// Name -> cost-coefficient map.
CostCoeffMap& Cost::getCostCoeffMap() {
    return _costCoeffMap;
}
|
||||
|
||||
double Cost::getValue() {
|
||||
double val = 0;
|
||||
for (CostMap::iterator it = _costMap.begin(); it != _costMap.end(); ++it) {
|
||||
val += _costCoeffMap[it->first] * (it->second->size() == 0 ? 0 : it->second->at(0));
|
||||
}
|
||||
return val;
|
||||
}
|
||||
|
||||
// Merges another Cost into this one. Cost names we have not seen before are
// adopted with the other's coefficient and a fresh, owned cost vector; case
// counts and per-component cost values are accumulated element-wise.
Cost& Cost::operator += (Cost& er) {
    CostMap& otherMap = er.getCostMap();
    CostCoeffMap& otherCoeffMap = er.getCostCoeffMap();

    for (CostMap::const_iterator it = otherMap.begin(); it != otherMap.end(); ++it) {
        const std::string& name = it->first;
        if (_costMap.count(name) == 0) {
            // First time this cost appears: allocate our own vector for it.
            _costMap[name] = new doublev();
            _costCoeffMap[name] = otherCoeffMap[name];
            _numCases[name] = er.getNumCasesMap()[name];
        } else {
            _numCases[name] += er.getNumCasesMap()[name];
        }

        doublev& mine = *_costMap[name];
        doublev& theirs = *otherMap[name];
        // Either side may still be empty; otherwise lengths must agree.
        assert(mine.size() == 0 || theirs.size() == 0 || mine.size() == theirs.size());
        for (size_t i = 0; i < theirs.size(); i++) {
            if (mine.size() <= i) {
                mine.push_back(0);
            }
            mine[i] += theirs[i];
        }
    }
    return *this;
}
|
||||
|
||||
// Deletes every stored cost vector; this object owns the vectors in its map.
// NOTE(review): the Cost(vector<CostLayer*>&) constructor stores pointers
// obtained from the layers — confirm those vectors are meant to be owned
// (and deleted) here rather than by the layers.
Cost::~Cost() {
    for (CostMap::const_iterator it = _costMap.begin(); it != _costMap.end(); ++it) {
        delete it->second;
    }
}
|
||||
|
||||
void Cost::print() {
|
||||
for (CostMap::const_iterator it = _costMap.begin(); it != _costMap.end(); ++it) {
|
||||
printf("%s (%.3f): ", it->first.c_str(), _costCoeffMap[it->first]);
|
||||
doublev& vec = *_costMap[it->first];
|
||||
for (int z = 0; z < vec.size(); ++z) {
|
||||
printf("%.3f", vec[z]);
|
||||
if (z < vec.size() - 1) {
|
||||
printf(", ");
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
}
|
82
caffe2/contrib/cuda-convnet2/cudaconvnet/src/data.cu
Normal file
82
caffe2/contrib/cuda-convnet2/cudaconvnet/src/data.cu
Normal file
@ -0,0 +1,82 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include "../../util/include/matrix.h"
|
||||
#include "../include/data.cuh"
|
||||
#include "../include/timer.cuh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Provider that slices a host-side dataset into fixed-size minibatches.
// Starts with no data attached.
DataProvider::DataProvider(int minibatchSize)
    : _minibatchSize(minibatchSize), _hData(NULL) {
}
|
||||
|
||||
// Frees the current dataset (delete on NULL is a no-op) and forgets it.
void DataProvider::clearData() {
    delete _hData;
    _hData = NULL;
}
|
||||
|
||||
// Attaches a new host dataset. The provider takes ownership (clearData()
// deletes it); per the original note, DataWorker is responsible for
// calling clearData.
void DataProvider::setData(CPUData& hData) {
    _hData = &hData;
    assert(_hData != NULL);
}
|
||||
|
||||
// Returns minibatch idx as the case slice [idx*size, (idx+1)*size).
CPUData& DataProvider::getMinibatch(int idx) {
    assert(idx >= 0 && idx < getNumMinibatches());
    const int start = idx * _minibatchSize;
    return getDataSlice(start, start + _minibatchSize);
}
|
||||
|
||||
/*
 * Returns a newly heap-allocated CPUData holding cases [startCase, endCase)
 * of every data matrix; endCase is clamped to the number of cases. The
 * caller owns the returned object (and, via it, the matrix vector).
 *
 * Bug fix: previously two CPUData objects were constructed from the same
 * matrix vector and the first one was leaked; only one is created now.
 */
CPUData& DataProvider::getDataSlice(int startCase, int endCase) {
    assert(_hData != 0);
    assert(_hData->getNumCases() > 0);
    endCase = min(_hData->getNumCases(), endCase);
    // TODO: maintain these matrices, no point re-creating them all the time
    MatrixV& miniData = *new MatrixV();

    for (int i = 0; i < _hData->getData().size(); i++) {
        // NOTE: if hData is transposed, then the output minibatch matrix
        // can be a view. No need to allocate new CPU memory here. Might
        // want to look into optimizing that in the future, though it's
        // unlikely to be a big deal.
        if (_hData->isTrans()) {
            miniData.push_back(&(*_hData)[i].sliceCols(startCase, endCase));
        } else {
            miniData.push_back(new Matrix());
            (*_hData)[i].sliceCols(startCase, endCase, *miniData.back());
        }
    }
    return *new CPUData(&miniData);
}
|
||||
|
||||
int DataProvider::getNumMinibatches() {
|
||||
assert(_hData != 0);
|
||||
assert(_hData->getNumCases() > 0);
|
||||
return DIVUP(_hData->getNumCases(), _minibatchSize);
|
||||
}
|
||||
|
||||
int DataProvider::getMinibatchSize() {
|
||||
return _minibatchSize;
|
||||
}
|
||||
|
||||
// Total number of cases in the attached (non-empty) dataset.
int DataProvider::getNumCases() {
    assert(_hData != 0);
    assert(_hData->getNumCases() > 0);
    return _hData->getNumCases();
}
|
202
caffe2/contrib/cuda-convnet2/cudaconvnet/src/gradreducer.cu
Normal file
202
caffe2/contrib/cuda-convnet2/cudaconvnet/src/gradreducer.cu
Normal file
@ -0,0 +1,202 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "../include/util.cuh"
|
||||
#include "../include/gradreducer.cuh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/* =====================
|
||||
* IGradReducer
|
||||
* =====================
|
||||
*/
|
||||
/*
 * Base class for threads that reduce activity gradients from several source
 * devices into the parent layer's device. numExpectedMsgs maps a device id
 * to the number of backward messages expected from it per pass; the total
 * is precomputed here.
 */
IActGradReducer::IActGradReducer(Layer& parent, map<int, int> numExpectedMsgs)
    : Thread(true, getDeviceCPUs(parent.getDeviceID())), _parent(&parent), _numExpectedMsgs(numExpectedMsgs) {
    _numExpectedMsgsTotal = 0;
    for (map<int,int>::const_iterator it = numExpectedMsgs.begin(); it != numExpectedMsgs.end(); ++it) {
        _numExpectedMsgsTotal += it->second;
    }
}
|
||||
|
||||
// Nothing owned directly at this level; subclasses free their own state.
IActGradReducer::~IActGradReducer() {
}
|
||||
|
||||
void* IActGradReducer::run() {
|
||||
while (true) {
|
||||
reset();
|
||||
if (reduce()) {
|
||||
break;
|
||||
}
|
||||
_finishQueue.enqueue(0);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Cost layer will have nothing to dequeue, so just return immediately.
|
||||
int IActGradReducer::waitForFinish() {
|
||||
if (_numExpectedMsgsTotal > 0) {
|
||||
int i = _finishQueue.dequeue();
|
||||
assert(_finishQueue.getNumElements() == 0);
|
||||
return i;
|
||||
}
|
||||
// printf("%s not waiting for finish\n", _name.c_str());
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Factory: ensures the target device has an entry (possibly 0) in the
// expected-message map, then picks the parallel reducer for the 8-device
// topology and the sequential one otherwise.
IActGradReducer& IActGradReducer::makeGradReducer(Layer& parent, map<int, int> numExpectedMsgs) {
    const int tgtDeviceID = parent.getDeviceID();
    if (numExpectedMsgs.count(tgtDeviceID) == 0) {
        numExpectedMsgs[tgtDeviceID] = 0;
    }
    if (numExpectedMsgs.size() == 8) {
        return *new ParallelActGradReducer(parent, numExpectedMsgs);
    }
    return *new SequentialActGradReducer(parent, numExpectedMsgs);
}
|
||||
|
||||
/* =====================
|
||||
* SequentialGradReducer
|
||||
* =====================
|
||||
*/
|
||||
/*
 * Reduces gradients one source device at a time. The devices are visited in
 * a fixed circular order over the sorted device ids, starting at the
 * smallest id >= the target device id — so the target device, when it
 * expects any messages, is processed first (enforced by the assert below,
 * and required by the race-condition note near the bottom).
 */
SequentialActGradReducer::SequentialActGradReducer(Layer& parent, map<int, int> numExpectedMsgs)
    : IActGradReducer(parent, numExpectedMsgs) {
    intv deviceIDs;
    int tgtDeviceID = parent.getDeviceID();
    // Collect every contributing device; the target device itself is only
    // included when it actually expects messages.
    for (map<int, int>::const_iterator it = numExpectedMsgs.begin(); it != numExpectedMsgs.end(); ++it) {
        if (it->first != tgtDeviceID) {
            deviceIDs.push_back(it->first);
        }
    }
    if (numExpectedMsgs[tgtDeviceID] > 0) {
        deviceIDs.push_back(tgtDeviceID);
    }

    sort(deviceIDs.begin(), deviceIDs.end());

    // Find the index of the smallest device id >= tgtDeviceID; 1 << 16 acts
    // as an "infinity" sentinel above any real device id.
    int firstDeviceIdx = 0, firstDeviceID = 1 << 16;
    for (int i = 0; i < deviceIDs.size(); ++i) {
        if (deviceIDs[i] >= tgtDeviceID && deviceIDs[i] < firstDeviceID) {
            firstDeviceIdx = i;
            firstDeviceID = deviceIDs[i];
        }
    }

    // This is the order in which we process devices.
    // Walk the sorted list circularly starting at firstDeviceIdx, creating a
    // message queue per device.
    for (int i = firstDeviceIdx; _deviceIDs.size() < deviceIDs.size(); i = (i + 1) % deviceIDs.size()) {
        int d = deviceIDs[i];
        _deviceIDs.push_back(d);
        _messageQueues[d] = new Queue<int>();
    }
    //shuffleVector(_deviceIDs, 1, _deviceIDs.size());
    _broadcaster = new StreamBroadcast();

    // Note that we MUST process the tgtDeviceID first because
    // we write to it at every iteration, and the computation
    // thread writes to it too. By processing it first we ensure
    // that there's no race condition.
    assert(numExpectedMsgs[tgtDeviceID] == 0 || _deviceIDs[0] == tgtDeviceID);
    reset();
}
|
||||
|
||||
// Frees the per-device message queues and the stream broadcaster.
SequentialActGradReducer::~SequentialActGradReducer() {
    for (map<int,Queue<int>* >::const_iterator it = _messageQueues.begin(); it != _messageQueues.end(); ++it) {
        delete it->second;
    }
    delete _broadcaster;
}
|
||||
|
||||
void SequentialActGradReducer::reset() {
|
||||
for (map<int,int>::iterator it = _numReceivedMsgs.begin(); it != _numReceivedMsgs.end(); ++it) {
|
||||
_numReceivedMsgs[it->first] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * One reduction pass: for each device in the precomputed order, wait for
 * all of its messages, then fold its gradient into the target device's
 * gradient. Returns true if an exit message was received (detected when a
 * device's received count overshoots its expected count — stop() enqueues
 * ACT_GRAD_REDUCER_EXIT, which is presumably larger than any real count).
 */
bool SequentialActGradReducer::reduce() {
    int tgtDeviceID = _parent->getDeviceID();
    for (int didx = 0; didx < _deviceIDs.size(); ) {
        int d = _deviceIDs[didx];
        // Each enqueued message contributes 1 (or the large exit sentinel).
        _numReceivedMsgs[d] += _messageQueues[d]->dequeue();
        if (_numReceivedMsgs[d] == _numExpectedMsgs[d]) {
            if (d != tgtDeviceID) {
                NVMatrix::setDeviceID(tgtDeviceID);

                // Pull device d's gradient over to the target device.
                _parent->getActsGrad().resize(_parent->getActsGrad(d));
                map<int, NVMatrix*> mats;
                mats[d] = &_parent->getActsGrad(d);
                mats[tgtDeviceID] = &_parent->getActsGrad(tgtDeviceID);

                // didx > 0: accumulate into the target rather than overwrite.
                _broadcaster->transfer(mats, d, didx > 0, 1);
            }
            didx++;
            assert(_messageQueues[d]->getNumElements() == 0);
        } else if (_numReceivedMsgs[d] >= _numExpectedMsgs[d]) { // exit
            return true;
        }
    }
    return false;
}
|
||||
|
||||
// Signals that one gradient contribution from deviceID is ready.
void SequentialActGradReducer::enqueueReduction(int deviceID) {
    _messageQueues[deviceID]->enqueue(1);
}
|
||||
|
||||
void SequentialActGradReducer::stop() {
|
||||
for(map<int,Queue<int>* >::const_iterator it = _messageQueues.begin(); it != _messageQueues.end(); ++it) {
|
||||
it->second->enqueue(ACT_GRAD_REDUCER_EXIT);
|
||||
}
|
||||
join();
|
||||
}
|
||||
|
||||
/* =====================
|
||||
* ParallelActGradReducer
|
||||
* =====================
|
||||
*/
|
||||
// Reducer for the 8-GPU topology: delegates the actual reduction to an
// EightGPUReducer1 tree rooted at the parent layer's device.
ParallelActGradReducer::ParallelActGradReducer(Layer& parent, map<int, int> numExpectedMsgs)
    : IActGradReducer(parent, numExpectedMsgs), _numReceivedMsgs(0) {
    _reducer = &(new EightGPUReducer1(parent.getDeviceID()))->construct();
    // Scale (accumulate into) the target's own gradient only when the target
    // device itself expects gradient messages.
    _scaleTarget = numExpectedMsgs.count(parent.getDeviceID()) > 0 && numExpectedMsgs[parent.getDeviceID()] > 0;
}
|
||||
|
||||
/*
 * One reduction pass: block until every expected message has arrived, then
 * run the tree reduction over all per-device gradients. Returns true when
 * an exit message was received — stop() enqueues ACT_GRAD_REDUCER_EXIT,
 * which pushes the received count past the expected total.
 */
bool ParallelActGradReducer::reduce() {
    // TODO: make it so that you can start the reduction before you've received all the messages.
    while(_numReceivedMsgs < _numExpectedMsgsTotal) {
        _numReceivedMsgs += _messageQueue.dequeue();
    }
    if (_numReceivedMsgs > _numExpectedMsgsTotal) {
        return true; // exit
    }
    map<int,NVMatrix*> mats = _parent->getAllActsGrads();
    _reducer->reduce(mats, 1, _scaleTarget);
    assert(_messageQueue.getNumElements() == 0);
    return false;

}
|
||||
|
||||
// Signals that one gradient contribution is ready. The deviceID is unused:
// this reducer only counts messages, it does not track their origin.
void ParallelActGradReducer::enqueueReduction(int deviceID) {
    _messageQueue.enqueue(1);
}
|
||||
|
||||
// Posts the exit sentinel and waits for the reducer thread to terminate.
void ParallelActGradReducer::stop() {
    _messageQueue.enqueue(ACT_GRAD_REDUCER_EXIT);
    join();
}
|
||||
|
||||
// Clears the message counter ahead of the next reduction pass.
void ParallelActGradReducer::reset() {
    _numReceivedMsgs = 0;
}
|
135
caffe2/contrib/cuda-convnet2/cudaconvnet/src/jpeg.cpp
Normal file
135
caffe2/contrib/cuda-convnet2/cudaconvnet/src/jpeg.cpp
Normal file
@ -0,0 +1,135 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "../include/jpeg.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/* ========================
|
||||
* DecoderThread
|
||||
* ========================
|
||||
*/
|
||||
/*
 * Worker thread that decodes JPEGs [start_img, end_img) from a Python list
 * of encoded strings into rows of `target`, cropping each decoded image
 * from img_size down to inner_size. In multiview mode each image produces
 * 10 views (5 crops x 2 flips); in test mode crops are centered instead of
 * random. The decode scratch buffer is allocated lazily in decodeJpeg().
 */
DecoderThread::DecoderThread(PyObject* pyList, Matrix& target, int start_img, int end_img, int img_size, int inner_size, bool test, bool multiview)
    : Thread(true), _pyList(pyList), _target(&target), _start_img(start_img), _end_img(end_img),
      _img_size(img_size), _inner_size(inner_size), _test(test), _multiview(multiview),
      _decodeTarget(0), _decodeTargetSize(0) {
    _inner_pixels = _inner_size * _inner_size;
    _rseed = time(0); // seed for this thread's private rand_r() stream
}
|
||||
|
||||
// Releases the decode scratch buffer (free on NULL is a no-op).
DecoderThread::~DecoderThread(){
    free(_decodeTarget);
}
|
||||
|
||||
/*
 * Thread body: decodes each assigned image and writes its cropped view(s)
 * into _target. In multiview mode the target holds 10 * numSrcCases rows,
 * laid out as 10 blocks of numSrcCases rows each — block (flip * 5 + crop)
 * holds view `crop` of flip state `flip` for every image.
 */
void* DecoderThread::run() {
    int numSrcCases = PyList_GET_SIZE(_pyList);
    assert(_target->getNumCols() == _inner_pixels * 3);
    assert(_target->getNumRows() == PyList_GET_SIZE(_pyList) * (_multiview ? 10 : 1));

    int width, height;

    for (int64 i = _start_img; i < _end_img; ++i) {
        decodeJpeg(i, width, height);
        // The decoded image must have its short side equal to _img_size.
        assert((width == _img_size && height >= _img_size)
               || (height == _img_size && width >= _img_size));
        if (_multiview) {
            // 5 fixed crops x 2 flip states = 10 views per image.
            for (int flip = 0; flip < 2; ++flip) {
                crop(numSrcCases * (flip * 5 + 0) + i, width, height, flip, 0, 0); // top-left
                crop(numSrcCases * (flip * 5 + 1) + i, width, height, flip, width - _inner_size, 0); // top-right
                crop(numSrcCases * (flip * 5 + 2) + i, width, height, flip, (width - _inner_size) / 2, (height - _inner_size) / 2); // center
                crop(numSrcCases * (flip * 5 + 3) + i, width, height, flip, 0, height - _inner_size); // bottom-left
                crop(numSrcCases * (flip * 5 + 4) + i, width, height, flip, width - _inner_size, height - _inner_size); // bottom-right
            }
        } else {
            // Single view: random flip during training, no flip at test time.
            crop(i, width, height, !_test && (rand_r(&_rseed) % 2));
        }

    }
    return NULL;
}
|
||||
|
||||
void DecoderThread::decodeJpeg(int idx, int& width, int& height) {
|
||||
PyObject* pySrc = PyList_GET_ITEM(_pyList, idx);
|
||||
unsigned char* src = (unsigned char*)PyString_AsString(pySrc);
|
||||
size_t src_len = PyString_GET_SIZE(pySrc);
|
||||
|
||||
struct jpeg_decompress_struct cinf;
|
||||
struct jpeg_error_mgr jerr;
|
||||
cinf.err = jpeg_std_error(&jerr);
|
||||
jpeg_create_decompress(&cinf);
|
||||
jpeg_mem_src(&cinf, src, src_len);
|
||||
assert(jpeg_read_header(&cinf, TRUE));
|
||||
cinf.out_color_space = JCS_RGB;
|
||||
assert(jpeg_start_decompress(&cinf));
|
||||
assert(cinf.num_components == 3 || cinf.num_components == 1);
|
||||
width = cinf.image_width;
|
||||
height = cinf.image_height;
|
||||
|
||||
if (_decodeTargetSize < width * height * 3) {
|
||||
free(_decodeTarget);
|
||||
_decodeTargetSize = width * height * 3 * 3;
|
||||
_decodeTarget = (unsigned char*)malloc(_decodeTargetSize);
|
||||
}
|
||||
|
||||
while (cinf.output_scanline < cinf.output_height) {
|
||||
JSAMPROW tmp = &_decodeTarget[width * cinf.out_color_components * cinf.output_scanline];
|
||||
assert(jpeg_read_scanlines(&cinf, &tmp, 1) > 0);
|
||||
}
|
||||
assert(jpeg_finish_decompress(&cinf));
|
||||
jpeg_destroy_decompress(&cinf);
|
||||
}
|
||||
|
||||
/*
|
||||
* Uniform in [0,1)
|
||||
*/
|
||||
inline double DecoderThread::randUniform() {
|
||||
return double(rand_r(&_rseed)) / (int64(RAND_MAX) + 1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Uniform in [min, max)
|
||||
*/
|
||||
inline double DecoderThread::randUniform(double min, double max) {
|
||||
return (max - min) * randUniform() + min;
|
||||
}
|
||||
|
||||
// Convenience overload: -1/-1 asks the full crop() to pick the corner
// itself (centered in test mode, random otherwise).
void DecoderThread::crop(int64 i, int64 src_width, int64 src_height, bool flip) {
    crop(i, src_width, src_height, flip, -1, -1);
}
|
||||
|
||||
/*
 * Copies an _inner_size x _inner_size crop of the decoded image (stored
 * interleaved-RGB in _decodeTarget) into row i of _target, converting to
 * planar layout (channel-major: c * _inner_pixels + y * _inner_size + x).
 * A negative crop corner means "choose it here": centered in test mode,
 * uniformly random otherwise. When flip is set, the crop is mirrored
 * horizontally during the copy.
 */
void DecoderThread::crop(int64 i, int64 src_width, int64 src_height, bool flip, int64 crop_start_x, int64 crop_start_y) {
    const int64 border_size_y = src_height - _inner_size;
    const int64 border_size_x = src_width - _inner_size;
    if (crop_start_x < 0) {
        crop_start_x = _test ? (border_size_x / 2) : (rand_r(&_rseed) % (border_size_x + 1));
    }
    if (crop_start_y < 0) {
        crop_start_y = _test ? (border_size_y / 2) : (rand_r(&_rseed) % (border_size_y + 1));
    }
    // NOTE(review): src_pixels is computed but never used.
    const int64 src_pixels = src_width * src_height;
    for (int64 c = 0; c < 3; ++c) {
        for (int64 y = crop_start_y; y < crop_start_y + _inner_size; ++y) {
            for (int64 x = crop_start_x; x < crop_start_x + _inner_size; ++x) {
                assert((y >= 0 && y < src_height && x >= 0 && x < src_width));
                // Destination x is mirrored when flipping; source pixel is
                // read from the interleaved RGB buffer.
                _target->getCell(i, c * _inner_pixels + (y - crop_start_y) * _inner_size
                                    + (flip ? (_inner_size - 1 - x + crop_start_x)
                                            : (x - crop_start_x)))
                    = _decodeTarget[3 * (y * src_width + x) + c];
            }
        }
    }
}
|
2306
caffe2/contrib/cuda-convnet2/cudaconvnet/src/layer.cu
Normal file
2306
caffe2/contrib/cuda-convnet2/cudaconvnet/src/layer.cu
Normal file
File diff suppressed because it is too large
Load Diff
555
caffe2/contrib/cuda-convnet2/cudaconvnet/src/layer_kernels.cu
Normal file
555
caffe2/contrib/cuda-convnet2/cudaconvnet/src/layer_kernels.cu
Normal file
@ -0,0 +1,555 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <vector>
|
||||
#include <cmath>
|
||||
#include "../include/layer_kernels.cuh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/*
|
||||
* E = -log(y_t)
|
||||
* probs: (numOut, numCases)
|
||||
* labels: (1, numCases)
|
||||
* maxEnergies: (1, numCases)
|
||||
* labelLogProbs: (1, numCases) (*out)
|
||||
* correctProbs: (1, numCases) (*out)
|
||||
* top5Probs: (1, numCases) (*out)
|
||||
*
|
||||
* target: (1, numCases)
|
||||
*
|
||||
*/
|
||||
/*
 * One thread per case (column). For each case, computes the log-probability
 * of the true label, the top-1 accuracy indicator, and a top-`setSize`
 * accuracy value with fractional credit for ties.
 *
 * NOTE(review): this uses raw __logf(labelp), which yields -inf when the
 * label probability is exactly 0, unlike the safelog used by kCrossEntCost.
 */
__global__ void kMultiSoftmaxCost(float* probs, float* labels, float* maxProbs,
                                  float* labelLogProbs, float* correctProbs, float* top5Probs,
                                  const int numCases, const int numOut, const int setSize) {
    const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x;

    if (tx < numCases) {
        const int label = int(labels[tx]);
        const float maxp = maxProbs[tx];
        const float labelp = probs[label * numCases + tx];

        labelLogProbs[tx] = __logf(labelp);

        // Count outputs strictly above / exactly equal to the label's probability.
        int numBiggerProbs = 0, numEqualsProbs = 0;
        for (int i = 0; i < numOut; ++i) {
            numBiggerProbs += probs[i * numCases + tx] > labelp;
            numEqualsProbs += probs[i * numCases + tx] == labelp;
        }

        // Slots remaining in the top-setSize set after the strictly-bigger outputs.
        const int slotsLeft = setSize - numBiggerProbs;

        // Fractional credit when ties straddle the top-setSize boundary.
        top5Probs[tx] = slotsLeft <= 0.0f ? 0.0f : (numEqualsProbs <= slotsLeft ? 1.0f : float(slotsLeft) / numEqualsProbs);
        // Top-1: 1/(number of maximal outputs) if the label attains the max.
        correctProbs[tx] = labelp != maxp ? 0.0f : 1.0f / float(numEqualsProbs);
    }
}
|
||||
|
||||
/*
|
||||
* E = -log(y_t)
|
||||
* probs: (numOut, numCases)
|
||||
* labels: (1, numCases)
|
||||
* maxProbs: (1, numCases)
|
||||
* labelLogProbs: (1, numCases) (*out)
|
||||
* correctProbs: (1, numCases) (*out)
|
||||
* top5Probs: (1, numCases) (*out)
|
||||
*
|
||||
* target: (1, numCases) == log(y_l[labels,:]
|
||||
*/
|
||||
/*
 * Host launcher for kMultiSoftmaxCost. Validates layout (column-per-case,
 * row-major, contiguous), resizes the three 1 x numCases outputs, and
 * launches one thread per case on the default stream.
 */
void computeMultiSoftmaxCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs, NVMatrix& labelLogProbs_out,
                             NVMatrix& correctProbs_out, NVMatrix& top5Probs_out, int setSize) {
    int numCases = probs.getNumCols();
    int numOut = probs.getNumRows();

    assert(labels.getNumElements() == numCases);
    assert(!labels.isTrans());
    assert(!probs.isTrans());
    assert(labels.isContiguous());
    assert(probs.isContiguous());

//    NVMatrix& maxProbs = probs.max(0);

    labelLogProbs_out.resize(1, numCases);
    correctProbs_out.resize(1, numCases);
    top5Probs_out.resize(1, numCases);
    dim3 threads(LOGREG_ERR_THREADS_X, 1);
    dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1);
    cudaStream_t stream = NVMatrix::getDefaultStream();

    cudaFuncSetCacheConfig(kMultiSoftmaxCost, cudaFuncCachePreferL1);
    kMultiSoftmaxCost<<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), maxProbs.getDevData(),
                                                      labelLogProbs_out.getDevData(), correctProbs_out.getDevData(), top5Probs_out.getDevData(),
                                                      numCases, numOut, setSize);

    getLastCudaError("kMultiSoftmaxCost: Kernel execution failed");
//    cudaThreadSynchronize();
}
|
||||
|
||||
/*
|
||||
* E = sum(p_l * log(y_l))
|
||||
* probs: (numOut, numCases)
|
||||
* labels: (numOut, numCases)
|
||||
* maxProbs: (1, numCases)
|
||||
* labelLogProbs: (1, numCases) (*out)
|
||||
* correctProbs: (1, numCases) (*out)
|
||||
*
|
||||
* target: (1, numCases)
|
||||
*/
|
||||
/*
 * One thread per case. Computes the cross-entropy sum(p_l * log(y_l)) for
 * the case, plus a top-1 accuracy indicator (a label counts as "correct"
 * if any maximal-probability output has positive label mass).
 */
__global__ void kCrossEntCost(float* probs, float* labels, float* maxProbs, float* labelLogProbs, float* correctProbs,
                              const int numCases, const int numOut) {
    const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x;

    if (tx < numCases) {
        // Advance all pointers to this case's column.
        probs += tx;
        labels += tx;
        maxProbs += tx;
        labelLogProbs += tx;
        correctProbs += tx;

        const float maxp = maxProbs[0];

        /*
         * Compute the probability of guessing the correct case if you take the most-probable label.
         *
         * This is done like this:
         *
         * - If the most probable label is not equal to the true label, then the probability is zero.
         * - Otherwise, the probability is 1 / (number of labels whose probability is equal to the maximum).
         *
         * This is certainly overkill -- in practice, it's just about impossible for two labels to get assigned
         * maximum probability. But it's a safety measure to prevent over-estimating your accuracy.
         * Though it could never happen in reality. Well it could. But it wouldn't. Cool?
         */
        float crossEnt = 0.0f;
        int numMax = 0;
        bool correctLabel = false;
        for (int i = 0; i < numOut; i++) {
            const float label_prob = labels[i * numCases];
            const float model_prob = probs[i * numCases];
            numMax += model_prob == maxp;
            crossEnt += label_prob * safelog(model_prob); // safelog guards against log(0)
            correctLabel |= model_prob == maxp && label_prob > 0.0f;
        }
        labelLogProbs[0] = crossEnt;
        if (!correctLabel) {
            correctProbs[0] = 0.0f;
        } else {
            correctProbs[0] = 1.0f / float(numMax); // split credit across ties
        }
    }
}
|
||||
|
||||
/*
|
||||
* E = sum(p_l * log(y_l))
|
||||
* y_l: (numOut, numCases)
|
||||
* labels: (numOut, numCases)
|
||||
*
|
||||
* dE_dy_l: (numOut, numCases)
|
||||
*/
|
||||
/*
 * Gradient of the cross-entropy cost w.r.t. the model outputs y_l:
 * dE/dy = gradCoeff * p_l / y_l, one thread per (output, case) element.
 * The `add` template flag selects accumulate vs overwrite.
 */
template <bool add>
__global__ void kCrossEntGrad(float* y_l, float* labels, float* dE_dy_l, const int numCases,
                              const int numOut, const float gradCoeff) {
    const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
    const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
    const int tidx = ty * numCases + tx;

    if (ty < numOut && tx < numCases) {
        const float label_prob = labels[tidx];
        const float model_prob = y_l[tidx];
        // __fdividef: fast (approximate) device division.
        const float v = gradCoeff * __fdividef(label_prob, model_prob);
        if (add) {
            dE_dy_l[tidx] += v;
        } else {
            dE_dy_l[tidx] = v;
        }
    }
}
|
||||
|
||||
/*
|
||||
* E = sum(p_l * log(y_l))
|
||||
* y_l: (numOut, numCases)
|
||||
* labels: (numOut, numCases)
|
||||
*
|
||||
* dE_dx_l: (numOut, numCases)
|
||||
*/
|
||||
/*
 * Combined gradient of cross-entropy through a softmax, w.r.t. the softmax
 * inputs x_l: dE/dx = gradCoeff * (p_l - y_l). One thread per
 * (output, case) element; the `add` flag selects accumulate vs overwrite.
 */
template <bool add>
__global__ void kCrossEntSoftmaxGrad(float* y_l, float* labels, float* dE_dx_l, const int numCases,
                                     const int numOut, const float gradCoeff) {
    const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
    const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
    const int tidx = ty * numCases + tx;

    if (ty < numOut && tx < numCases) {
        const float model_prob = y_l[tidx];
        const float label_prob = labels[tidx];
        float v = gradCoeff * (label_prob - model_prob);
        if (add) {
            dE_dx_l[tidx] += v;
        } else {
            dE_dx_l[tidx] = v;
        }
    }
}
|
||||
|
||||
/*
 * Multinomial logistic-regression cost: negative log-likelihood of the true label.
 *
 * E = -log(y_t)
 * probs: (numOut, numCases)            predicted class probabilities
 * labels: (1, numCases)                true class index per case (stored as float)
 * maxProbs: (1, numCases)              per-case maximum of probs (precomputed)
 * labelLogProbs: (1, numCases) (*out)  log-probability of the true label
 * correctProbs: (1, numCases) (*out)   expected accuracy of the arg-max guess
 *
 * target: (1, numCases)
 */
__global__ void kLogregCost(float* probs, float* labels, float* maxProbs, float* labelLogProbs, float* correctProbs,
                            const int numCases, const int numOut) {
    const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x;  // one thread per case

    if (tx < numCases) {
        const int label = int(labels[tx]);                  // label is a float holding an integer class id
        const float maxp = maxProbs[tx];
        const float labelp = probs[label * numCases + tx];  // probability assigned to the true class

        labelLogProbs[tx] = __logf(labelp);                 // fast intrinsic log

        /*
         * Compute the probability of guessing the correct case if you take the most-probable label.
         *
         * This is done like this:
         *
         * - If the most probable label is not equal to the true label, then the probability is zero.
         * - Otherwise, the probability is 1 / (number of labels whose probability is equal to the maximum).
         *
         * This is certainly overkill -- in practice, it's just about impossible for two labels to get assigned
         * maximum probability. But it's a safety measure to prevent over-estimating your accuracy.
         * Though it could never happen in reality. Well it could. But it wouldn't. Cool?
         */
        if (labelp != maxp) {
            correctProbs[tx] = 0;
        } else {
            int numMax = 0;  // count of outputs tied at the maximum
            for (int i = 0; i < numOut; i++) {
                numMax += probs[i * numCases + tx] == maxp;
            }
            correctProbs[tx] = 1.0f / float(numMax);
        }
    }
}
|
||||
|
||||
/*
 * Gradient of the logistic-regression cost w.r.t. the model outputs y_l.
 * Only the true-label row receives a nonzero gradient: coeff / y_t.
 *
 * E = -log(y_t)
 * y_l: (numOut, numCases)
 * labels: (1, numCases)
 *
 * dE_dy_l: (numOut, numCases)
 */
template <bool add>
__global__ void kLogregCostGrad(float* y_l, float* labels, float* dE_dy_l, const int numCases,
                                const int numOut, const float gradCoeff) {
    const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;  // case index
    const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;  // output-unit index
    const int tidx = ty * numCases + tx;

    if (ty < numOut && tx < numCases) {
        const int label = int(labels[tx]);
        float v = gradCoeff * (label == ty);  // zero everywhere except the true-label row
        v = __fdividef(v, y_l[tidx]);         // fast division: coeff / y_t on that row
        if (add) {
            dE_dy_l[tidx] += v;
        } else {
            dE_dy_l[tidx] = v;
        }
    }
}
|
||||
|
||||
/*
 * Gradient of the logistic-regression cost taken through the softmax,
 * w.r.t. the softmax inputs: coeff * (1{row == label} - y).
 *
 * E = -log(y_t)
 * y_l: (numOut, numCases)
 * labels: (1, numCases)
 *
 * dE_dx_l: (numOut, numCases)
 */
template <bool add>
__global__ void kLogregSoftmaxGrad(float* y_l, float* labels, float* dE_dx_l, const int numCases,
                                   const int numOut, const float gradCoeff) {
    const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;  // case index
    const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;  // output-unit index
    const int tidx = ty * numCases + tx;

    if (ty < numOut && tx < numCases) {
        const int label = int(labels[tx]);
        // Combined softmax + NLL gradient: indicator minus predicted probability.
        float v = gradCoeff * ((label == ty) - y_l[tidx]);
        if (add) {
            dE_dx_l[tidx] += v;
        } else {
            dE_dx_l[tidx] = v;
        }
    }
}
|
||||
|
||||
/*
 * Backpropagates an arbitrary gradient through the softmax using the full
 * softmax Jacobian:
 *   dE/dx_i = y_i * sum_j dE/dy_j * (delta_ij - y_j)
 *
 * dE_dy_l: (numOut, numCases)
 * y_l: (numOut, numCases)
 *
 * dE_dx_l: (numOut, numCases)
 */
template <bool add>
__global__ void kSoftmaxGrad(float* dE_dy_l, float* y_l, float* dE_dx_l, const int numCases, const int numOut, const float scaleTarget, const float scaleGrad) {
    const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;  // case index
    const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;  // output-unit index
    const int tidx = ty * numCases + tx;

    if (ty < numOut && tx < numCases) {
        float v = 0;
        // Each output's gradient depends on every output of the same case (column).
        for (int j = 0; j < numOut; j++) {
            v += dE_dy_l[j * numCases + tx] * ((j == ty) - y_l[j * numCases + tx]);
        }
        v *= y_l[tidx];

        if (add) {
            // Blend into the existing target using the caller-supplied scales.
            dE_dx_l[tidx] = scaleTarget * dE_dx_l[tidx] + scaleGrad * v;
        } else {
            dE_dx_l[tidx] = scaleGrad * v;
        }
    }
}
|
||||
|
||||
/*
 * Gradient of an element-wise max: the incoming gradient actGrad flows to an
 * input element only where that input equals the max output; tied inputs each
 * receive the full gradient. Threads stride over the flat array in steps of
 * B_X * gridDim.x (grid-stride loop).
 */
template <int B_X, bool add>
__global__ void kEltwiseMaxGrad(float* actGrad, float* input, float* output, float* target,
                                const int numElements) {
    for (int i = B_X * blockIdx.x + threadIdx.x; i < numElements; i += B_X * gridDim.x) {
        if (add) {
            target[i] += actGrad[i] * (output[i] == input[i]);
        } else {
            target[i] = actGrad[i] * (output[i] == input[i]);
        }
    }
}
|
||||
|
||||
/*
 * Host wrapper for kEltwiseMaxGrad. Propagates actGrad into target wherever
 * input equals the element-wise-max output. When add is true the result is
 * accumulated into target (which must already match actGrad's dims);
 * otherwise target is resized and overwritten.
 */
void computeEltwiseMaxGrad(NVMatrix& actGrad, NVMatrix& input, NVMatrix& output, NVMatrix& target, bool add) {
    assert(actGrad.isContiguous());
    assert(output.isContiguous());
    assert(input.isContiguous());
    assert(actGrad.isSameDims(input));
    assert(actGrad.isSameDims(output));

    // Flat 1-D launch: 128 threads per block over all elements.
    dim3 blocks(DIVUP(actGrad.getNumElements(), 128));
    dim3 threads(128);
    cudaStream_t stream = NVMatrix::getDefaultStream();
    if (add) {
        assert(actGrad.isSameDims(target));
        cudaFuncSetCacheConfig(kEltwiseMaxGrad<128, true>, cudaFuncCachePreferL1);
        kEltwiseMaxGrad<128, true><<<blocks, threads, 0, stream>>>(actGrad.getDevData(), input.getDevData(), output.getDevData(), target.getDevData(), actGrad.getNumElements());
    } else {
        target.resize(actGrad);
        cudaFuncSetCacheConfig(kEltwiseMaxGrad<128, false>, cudaFuncCachePreferL1);
        kEltwiseMaxGrad<128, false><<<blocks, threads, 0, stream>>>(actGrad.getDevData(), input.getDevData(), output.getDevData(), target.getDevData(), actGrad.getNumElements());
    }

    getLastCudaError("computeEltwiseMaxGrad: Kernel execution failed");
}
|
||||
|
||||
/*
 * Host wrapper computing the cross-entropy cost and arg-max accuracy.
 *
 * E = sum_i{-p_i*log(y_i)}
 * probs: (numOut, numCases)
 * labels: (numOut, numCases)
 * maxProbs: (1, numCases)              computed internally via probs.max(0)
 * labelLogProbs: (1, numCases) (*out)
 * correctProbs: (1, numCases) (*out)
 *
 * target: (1, numCases)
 */
void computeCrossEntCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out) {
    int numCases = probs.getNumCols();
    int numOut = probs.getNumRows();

    assert(labels.isSameDims(probs));
    assert(!labels.isTrans());
    assert(!probs.isTrans());
    assert(labels.isContiguous());
    assert(probs.isContiguous());

    // probs.max(0) allocates a new matrix; it is freed at the end of this function.
    NVMatrix& maxProbs = probs.max(0);

    labelLogProbs_out.resize(1, numCases);
    correctProbs_out.resize(1, numCases);
    dim3 threads(LOGREG_ERR_THREADS_X, 1);
    dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1);
    cudaStream_t stream = NVMatrix::getDefaultStream();
    cudaFuncSetCacheConfig(kCrossEntCost, cudaFuncCachePreferL1);
    kCrossEntCost<<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), maxProbs.getDevData(),
                                                  labelLogProbs_out.getDevData(), correctProbs_out.getDevData(),
                                                  numCases, numOut);
    getLastCudaError("kCrossEntCost: Kernel execution failed");

    // Release the temporary returned by probs.max(0).
    delete &maxProbs;
}
|
||||
|
||||
/*
 * Host wrapper for kCrossEntGrad: gradient of cross-entropy w.r.t. the model
 * outputs. If add, accumulates into target; otherwise target is resized and
 * overwritten. coeff scales the gradient.
 */
void computeCrossEntGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
    int numCases = probs.getLeadingDim();
    int numOut = probs.getFollowingDim();
    assert(labels.isSameDims(probs));
    assert(probs.isContiguous());
    assert(target.isContiguous());
    assert(labels.isContiguous());
    assert(!labels.isTrans());
    assert(!probs.isTrans());

    dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
    dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
    cudaStream_t stream = NVMatrix::getDefaultStream();
    // NOTE(review): unlike computeCrossEntSoftmaxGrad, no cudaFuncSetCacheConfig
    // call is made here -- possibly intentional; confirm before changing.
    if (!add) {
        target.resize(probs);
        kCrossEntGrad<false><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                             numCases, numOut, coeff);
    } else {
        kCrossEntGrad<true><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                            numCases, numOut, coeff);
    }

    getLastCudaError("kCrossEntGrad: Kernel execution failed");
}
|
||||
|
||||
/*
 * Host wrapper for kSoftmaxGrad: backpropagates actsGrad (dE/dy) through the
 * softmax whose outputs are acts, writing dE/dx into target as
 * scaleTarget * target + scaleGrad * grad. scaleTarget == 0 selects the
 * overwrite path (and resizes target).
 */
void computeSoftmaxGrad(NVMatrix& acts, NVMatrix& actsGrad, NVMatrix& target, float scaleTarget, float scaleGrad) {
    int numCases = acts.getLeadingDim();
    int numOut = acts.getFollowingDim();

    assert(acts.isSameDims(actsGrad));
    assert(acts.isContiguous());
    assert(actsGrad.isContiguous());
    assert(target.isContiguous());
    // This path expects transposed (column-major leading) matrices.
    assert(acts.isTrans());
    assert(actsGrad.isTrans());

    dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
    dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
    cudaStream_t stream = NVMatrix::getDefaultStream();

    if (scaleTarget == 0) {
        target.resize(acts);
        kSoftmaxGrad<false><<<blocks, threads, 0, stream>>>(actsGrad.getDevData(), acts.getDevData(), target.getDevData(), numCases, numOut, scaleTarget, scaleGrad);
    } else {
        kSoftmaxGrad<true><<<blocks, threads, 0, stream>>>(actsGrad.getDevData(), acts.getDevData(), target.getDevData(), numCases, numOut, scaleTarget, scaleGrad);
    }
    getLastCudaError("computeSoftmaxGrad: Kernel execution failed");
}
|
||||
|
||||
/*
 * Host wrapper for kCrossEntSoftmaxGrad: fused cross-entropy + softmax
 * gradient, coeff * (labels - probs). If add, accumulates into target;
 * otherwise target is resized and overwritten.
 */
void computeCrossEntSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
    int numCases = probs.getLeadingDim();
    int numOut = probs.getFollowingDim();
    // Dims must agree even though the matrices differ in transposedness below.
    assert(labels.getLeadingDim() == probs.getLeadingDim() && labels.getFollowingDim() == probs.getFollowingDim());
    assert(probs.isContiguous());
    assert(target.isContiguous());
    assert(labels.isContiguous());
    assert(probs.isTrans());
    assert(!labels.isTrans());

    dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
    dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
    cudaStream_t stream = NVMatrix::getDefaultStream();
    if (!add) {
        target.resize(probs);
        cudaFuncSetCacheConfig(kCrossEntSoftmaxGrad<false>, cudaFuncCachePreferL1);
        kCrossEntSoftmaxGrad<false><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                                    numCases, numOut, coeff);
    } else {
        cudaFuncSetCacheConfig(kCrossEntSoftmaxGrad<true>, cudaFuncCachePreferL1);
        kCrossEntSoftmaxGrad<true><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                                   numCases, numOut, coeff);
    }
    getLastCudaError("kCrossEntSoftmaxGrad: Kernel execution failed");
}
|
||||
|
||||
/*
 * Host wrapper for kLogregCost: per-case log-probability of the true label
 * and expected arg-max accuracy.
 *
 * E = -log(y_t)
 * probs: (numOut, numCases)
 * labels: (1, numCases)
 * maxProbs: (1, numCases)              caller-supplied per-case max of probs
 * labelLogProbs: (1, numCases) (*out)
 * correctProbs: (1, numCases) (*out)
 *
 * target: (1, numCases) == log(y_l[labels,:])
 */
void computeLogregCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out) {
    int numCases = probs.getNumCols();
    int numOut = probs.getNumRows();

    assert(labels.getNumElements() == numCases);
    assert(!labels.isTrans());
    assert(!probs.isTrans());
    assert(labels.isContiguous());
    assert(probs.isContiguous());

    labelLogProbs_out.resize(1, numCases);
    correctProbs_out.resize(1, numCases);
    dim3 threads(LOGREG_ERR_THREADS_X, 1);  // 1-D launch: one thread per case
    dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1);
    cudaStream_t stream = NVMatrix::getDefaultStream();
    cudaFuncSetCacheConfig(kLogregCost, cudaFuncCachePreferL1);
    kLogregCost<<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), maxProbs.getDevData(),
                                                labelLogProbs_out.getDevData(), correctProbs_out.getDevData(),
                                                numCases, numOut);
    getLastCudaError("computeLogregCost: Kernel execution failed");
}
|
||||
|
||||
/*
 * Host wrapper for kLogregCostGrad: gradient of the logreg cost w.r.t. the
 * model outputs. If add, accumulates into target; otherwise target is resized
 * and overwritten. coeff scales the gradient.
 */
void computeLogregGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
    int numCases = probs.getLeadingDim();
    int numOut = probs.getFollowingDim();
    assert(labels.getNumElements() == numCases);
    assert(probs.isContiguous());
    assert(target.isContiguous());
    assert(labels.isContiguous());
    assert(!labels.isTrans());
    assert(!probs.isTrans());

    dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
    dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
    cudaStream_t stream = NVMatrix::getDefaultStream();
    if (!add) {
        target.resize(probs);
        kLogregCostGrad<false><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                               numCases, numOut, coeff);
    } else {
        kLogregCostGrad<true><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                              numCases, numOut, coeff);
    }

    getLastCudaError("computeLogregGrad: Kernel execution failed");
}
|
||||
|
||||
/*
 * Host wrapper for kLogregSoftmaxGrad: fused logreg + softmax gradient,
 * coeff * (1{row == label} - probs). If add, accumulates into target;
 * otherwise target is resized and overwritten.
 */
void computeLogregSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
    int numCases = probs.getLeadingDim();
    int numOut = probs.getFollowingDim();
    assert(labels.getNumElements() == numCases);
    assert(probs.isContiguous());
    assert(target.isContiguous());
    assert(labels.isContiguous());
    assert(probs.isTrans());  // softmax path expects transposed probs

    dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
    dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
    cudaStream_t stream = NVMatrix::getDefaultStream();
    if (!add) {
        target.resize(probs);
        kLogregSoftmaxGrad<false><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                                  numCases, numOut, coeff);
    } else {
        kLogregSoftmaxGrad<true><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                                 numCases, numOut, coeff);
    }

    getLastCudaError("computeLogregSoftmaxGrad: Kernel execution failed");
}
|
114
caffe2/contrib/cuda-convnet2/cudaconvnet/src/lr.cu
Normal file
114
caffe2/contrib/cuda-convnet2/cudaconvnet/src/lr.cu
Normal file
@ -0,0 +1,114 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <string>
|
||||
#include "../include/lr.cuh"
|
||||
#include "../include/util.cuh"
|
||||
|
||||
/*
 * ==================================
 * ParameterSchedule
 * ==================================
 */

/*
 * Factory: builds a schedule from a python dict of the form
 *   {"type": "const"|"linear"|"exp"|"dexp", "params": {"base": ..., ...}}.
 * Returns a heap-allocated schedule by reference; the caller owns it.
 * Throws a std::string for unknown schedule types.
 */
ParameterSchedule& ParameterSchedule::make(PyObject* schedDict) {
    std::string type = pyDictGetString(schedDict, "type");
    // PyDict_GetItemString returns a borrowed reference; no decref needed.
    PyObject* paramsDict = PyDict_GetItemString(schedDict, "params");
    double base = pyDictGetFloat(paramsDict, "base");
    if (type == "const") {
        return *new ParameterSchedule(base);
    } else {
        // All decaying schedules reduce the base rate by tgtFactor overall.
        double tgtFactor = pyDictGetFloat(paramsDict, "tgtFactor");
        if (type == "linear") {
            return *new LinearParameterSchedule(base, tgtFactor);
        } else if (type == "exp") {
            return *new ExpParameterSchedule(base, tgtFactor);
        } else if (type == "dexp") {
            double numSteps = pyDictGetInt(paramsDict, "numSteps");
            return *new DiscreteExpParameterSchedule(base, tgtFactor, numSteps);
        }
    }
    throw std::string("Unknown learning rate schedule type ") + type;
}
|
||||
|
||||
// Base schedule: a constant value independent of training progress.
ParameterSchedule::ParameterSchedule(double baseRate)
    : _baseRate(baseRate) {
}

// progress (in [0, 1] by convention of the subclasses) is ignored here;
// the base class implements the "const" schedule.
double ParameterSchedule::getValue(double progress) {
    return _baseRate;
}

// The initial (progress == 0) rate, shared by all schedule types.
double ParameterSchedule::getBaseValue() const {
    return _baseRate;
}

ParameterSchedule::~ParameterSchedule() {
}
|
||||
|
||||
/*
 * ==================================
 * LinearParameterSchedule
 * ==================================
 */
// Linear interpolation from baseRate down to baseRate / tgtFactor.
LinearParameterSchedule::LinearParameterSchedule(double baseRate, double tgtFactor)
    : ParameterSchedule(baseRate) {
    _finalRate = baseRate / tgtFactor;
}

// progress in [0, 1]: 0 -> base rate, 1 -> final rate.
double LinearParameterSchedule::getValue(double progress) {
    return _baseRate * (1 - progress) + _finalRate * progress;
}
|
||||
|
||||
/*
 * ==================================
 * ExpParameterSchedule
 * ==================================
 */
// Geometric (exponential) decay from baseRate to baseRate / tgtFactor.
ExpParameterSchedule::ExpParameterSchedule(double baseRate, double tgtFactor)
    : ParameterSchedule(baseRate) {
    _powBase = 1.0 / tgtFactor;  // rate(progress) = base * _powBase^progress
}

// progress in [0, 1]: 0 -> base rate, 1 -> base / tgtFactor.
double ExpParameterSchedule::getValue(double progress) {
    return _baseRate * std::pow(_powBase, progress);
}
|
||||
|
||||
/*
 * ==================================
 * DiscreteExpParameterSchedule
 * ==================================
 */
// Exponential decay quantized to numSteps discrete rates; the last step is
// exactly baseRate / tgtFactor.
DiscreteExpParameterSchedule::DiscreteExpParameterSchedule(double baseRate, double tgtFactor, int numSteps)
    : ParameterSchedule(baseRate) {
    // Sample the continuous exponential schedule at numSteps - 1 points...
    ExpParameterSchedule elrs(baseRate, tgtFactor);
    double finalRate = baseRate / tgtFactor;
    for (int i = 0; i < numSteps - 1; i++) {
        double progress = double(i) / (numSteps - 1);
        _rates.push_back(elrs.getValue(progress));
    }
    // ...and pin the last step to the exact final rate.
    _rates.push_back(finalRate);
    //printf("initialized base %e, final %e, stpes %d\n", baseRate, finalRate, numSteps);
}
|
||||
|
||||
/*
 * Maps progress (in [0, 1]) onto one of the precomputed discrete rates:
 * bucket i covers progress <= (i + 1) / numRates. Progress beyond 1 (or
 * floating-point round-off) falls back to the final rate.
 */
double DiscreteExpParameterSchedule::getValue(double progress) {
    // size_t loop index avoids the signed/unsigned comparison warning of the
    // original `int i < _rates.size()`; behavior is unchanged.
    for (size_t i = 0; i < _rates.size(); ++i) {
        if (progress <= double(i + 1) / _rates.size()) {
            return _rates[i];
        }
    }
    return _rates.back();
}
|
||||
|
139
caffe2/contrib/cuda-convnet2/cudaconvnet/src/memorysource.cu
Normal file
139
caffe2/contrib/cuda-convnet2/cudaconvnet/src/memorysource.cu
Normal file
@ -0,0 +1,139 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "../include/memorysource.cuh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/*
 * =======================
 * MemoryView
 * =======================
 */
// A named window onto a row range of a shared MemorySource buffer.
// The view does not own the source (see the disabled cleanup in ~MemoryView).
MemoryView::MemoryView(MemorySource& src, std::string& name) : _src(&src), _name(name) {
}

MemoryView::~MemoryView() {
    // Truncate-on-destroy was disabled; the source is not freed here.
//    if (_src->truncate(_name)) {
//        delete _src;
//    }
}

// This view's slice, with the source (re)sized to numCases columns.
NVMatrix& MemoryView::getMemory(int numCases) {
    return _src->getMemory(_name, numCases);
}

// This view's slice at the source's current width.
NVMatrix& MemoryView::getMemory() {
    return _src->getMemory(_name);
}

MemorySource& MemoryView::getMemorySource() {
    return *_src;
}

// True when this view spans the entire source buffer.
bool MemoryView::isParent() {
    return _src->getRange(_name).first == 0 && _src->getRange(_name).second == _src->getSize();
}

std::string& MemoryView::getName() {
    return _name;
}

// Registers a new named user of the same row range and returns its view.
MemoryView& MemoryView::clone(std::string& name) {
    return _src->addUser(name, _src->getRange(_name));
}
|
||||
|
||||
/*
 * =======================
 * MemorySource
 * =======================
 */
// A device-memory pool of `size` rows on `deviceID`, shared by named views.
MemorySource::MemorySource(int size, int deviceID) : _size(size), _deviceID(deviceID) {
}

MemorySource::~MemorySource() {
    // Each MemoryView is deleted by owner Layer, and the last one deletes the MemorySource.
    // So this is a no-op.
}

// Returns the named slice at the buffer's current leading dim (case count).
NVMatrix& MemorySource::getMemory(std::string& name) {
    return getMemory(name, _memory.getLeadingDim());
}

// Deletes old view when appropriate
// Returns the named slice, resizing the backing buffer to numCases columns if
// needed (which invalidates and drops all cached slice views). Thread-safe
// via _lock.
NVMatrix& MemorySource::getMemory(std::string& name, int numCases) {
    // Negative numCases means "keep the current width".
    numCases = numCases < 0 ? _memory.getLeadingDim() : numCases;
    _lock.acquire();
    if (_memory.getLeadingDim() != numCases || _memory.getFollowingDim() != _size) {
        // Resize on the source's own device, then restore the caller's device.
        int d = NVMatrix::getDeviceID();
        NVMatrix::setDeviceID(_deviceID);
        _memory.resize(_size, numCases, false);
        // All cached views point into the old allocation; drop them.
        for (map<std::string,NVMatrix*>::const_iterator it = _memoryViews.begin(); it != _memoryViews.end(); ++it) {
            delete it->second;
        }
        _memoryViews.clear();
        if (d >= 0) {
            NVMatrix::setDeviceID(d);
        }
    }
    // Lazily (re)create the slice for this name from its registered row range.
    if (_memoryViews.count(name) == 0) {
        assert(!_memory.isTrans());
        _memoryViews[name] = &_memory.sliceRows(_viewRanges[name].first, _viewRanges[name].second);
    }
    NVMatrix& view = *_memoryViews[name];
    assert(view.isContiguous());
    _lock.release();
    return view;
}

// Registers a named user for the given row range and returns a new view.
// Each name may be registered only once.
MemoryView& MemorySource::addUser(std::string& name, std::pair<int,int> range) {
    assert(_viewRanges.count(name) == 0);
    _viewRanges[name] = range;
    return *new MemoryView(*this, name);
}

// Registers a user spanning the whole buffer.
MemoryView& MemorySource::addUser(std::string& name) {
    return addUser(name, std::pair<int,int>(0, _size));
}

// Convenience: allocate a source and hand back its first (parent) view.
MemoryView& MemorySource::make(int size, int deviceID, std::string& parentUser) {
    return (new MemorySource(size, deviceID))->addUser(parentUser);
}

pair<int,int> MemorySource::getRange(std::string& name) {
    return _viewRanges[name];
}

int MemorySource::getSize() {
    return _size;
}

// Records a truncate request for `name`; once every registered view has
// requested truncation, frees all cached views and the backing memory.
// Returns true only on the call that actually truncated. Thread-safe.
bool MemorySource::truncate(std::string& name) {
    bool truncated = false;
    _lock.acquire();
    _truncateRequests.insert(name);
    if (_truncateRequests.size() == _viewRanges.size()) {
        for (map<std::string,NVMatrix*>::const_iterator it = _memoryViews.begin(); it != _memoryViews.end(); ++it) {
            delete it->second;
        }
        _memoryViews.clear();
        _memory.truncate();
        _truncateRequests.clear();
        truncated = true;
    }
    _lock.release();
    return truncated;
}
|
75
caffe2/contrib/cuda-convnet2/cudaconvnet/src/neuron.cu
Normal file
75
caffe2/contrib/cuda-convnet2/cudaconvnet/src/neuron.cu
Normal file
@ -0,0 +1,75 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "../include/neuron.cuh"
|
||||
#include "../include/util.cuh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/*
 * Factory: builds an activation-function object from a python dict of the
 * form {"type": <name>, "params": {...}}. Returns a heap-allocated Neuron by
 * reference; the caller owns it. Throws a std::string for unknown types.
 */
Neuron& Neuron::makeNeuron(PyObject* neuronDict) {
    std::string type = pyDictGetString(neuronDict, "type");
    // PyDict_GetItemString returns a borrowed reference; no decref needed.
    PyObject* neuronParamsDict = PyDict_GetItemString(neuronDict, "params");

    if (type == "relu") {
        return *new ReluNeuron();
    }

    if (type == "drelu") {
        return *new DoubleReluNeuron(pyDictGetFloat(neuronParamsDict, "a"));
    }

    if (type == "softrelu") {
        return *new SoftReluNeuron();
    }

    if (type == "brelu") {
        return *new BoundedReluNeuron(pyDictGetFloat(neuronParamsDict, "a"));
    }

    if (type == "abs") {
        return *new AbsNeuron();
    }

    if (type == "logistic") {
        return *new LogisticNeuron();
    }

    if (type == "tanh") {
        return *new TanhNeuron(pyDictGetFloat(neuronParamsDict, "a"), pyDictGetFloat(neuronParamsDict, "b"));
    }

    if (type == "square") {
        return *new SquareNeuron();
    }

    if (type == "sqrt") {
        return *new SqrtNeuron();
    }

    if (type == "linear") {
        return *new LinearNeuron(pyDictGetFloat(neuronParamsDict, "a"), pyDictGetFloat(neuronParamsDict, "b"));
    }

    if (type == "log") {
        return *new LogNeuron(pyDictGetFloat(neuronParamsDict, "a"));
    }

    // "ident" is the identity activation implemented by the base class.
    if (type == "ident") {
        return *new Neuron();
    }

    throw std::string("Unknown neuron type: ") + type;
}
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user