Remove caffe2 contrib and experiments (#125038)

This PR splits out a smaller piece of #122527.
To be noted, it was inspired by and co-developed with @r-barnes.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125038
Approved by: https://github.com/malfet
This commit is contained in:
cyy
2024-04-29 06:27:13 +00:00
committed by PyTorch MergeBot
parent 555f1aeb02
commit 5585138db9
189 changed files with 1 addition and 37411 deletions

View File

@ -446,7 +446,6 @@ cu_library(
# caffe2
CAFFE2_COPTS = COMMON_COPTS + [
"-Dcaffe2_EXPORTS",
"-DCAFFE2_USE_GLOO",
"-DCAFFE2_USE_CUDNN",
"-DCAFFE2_BUILD_MAIN_LIB",
"-fvisibility-inlines-hidden",
@ -454,22 +453,6 @@ CAFFE2_COPTS = COMMON_COPTS + [
"-fno-trapping-math",
]
filegroup(
name = "caffe2_contrib_srcs",
srcs = [
"caffe2/contrib/aten/aten_op.cc",
"caffe2/contrib/gloo/allgather_ops.cc",
"caffe2/contrib/gloo/allreduce_ops.cc",
"caffe2/contrib/gloo/barrier_ops.cc",
"caffe2/contrib/gloo/broadcast_ops.cc",
"caffe2/contrib/gloo/common.cc",
"caffe2/contrib/gloo/common_world_ops.cc",
"caffe2/contrib/gloo/context.cc",
"caffe2/contrib/gloo/reduce_scatter_ops.cc",
"caffe2/contrib/gloo/store_handler.cc",
],
)
filegroup(
name = "caffe2_core_srcs",
srcs = [
@ -1024,10 +1007,6 @@ filegroup(
filegroup(
name = "caffe2_cuda_cpp_srcs",
srcs = [
"caffe2/contrib/aten/aten_op_gpu.cc",
"caffe2/contrib/gloo/allreduce_ops_gpu.cc",
"caffe2/contrib/gloo/broadcast_ops_gpu.cc",
"caffe2/contrib/gloo/common_world_ops_gpu.cc",
"caffe2/core/blob_serialization_gpu.cc",
"caffe2/core/common_cudnn.cc",
"caffe2/core/common_gpu.cc",
@ -1271,35 +1250,10 @@ cc_library(
],
)
py_binary(
name = "gen_op",
srcs = ["caffe2/contrib/aten/gen_op.py"],
deps = ["//torchgen"],
)
genrule(
name = "generated_caffe2_aten_op_headers",
srcs = [
"caffe2/contrib/aten/aten_op_template.h",
"aten/src/ATen/Declarations.yaml",
],
outs = ["caffe2/caffe2/contrib/aten/gen_aten_op.h"],
cmd = """
$(location :gen_op) \
--output_prefix gen_ \
--install_dir $(@D) \
--aten_root `dirname $(location aten/src/ATen/Declarations.yaml)`/../.. \
--template_dir `dirname $(location caffe2/contrib/aten/aten_op_template.h)` \
--yaml_dir `dirname $(location aten/src/ATen/Declarations.yaml)`""",
tools = [":gen_op"],
)
cc_library(
name = "caffe2_headers",
hdrs = glob(
[
"caffe2/contrib/aten/*.h",
"caffe2/contrib/gloo/*.h",
"caffe2/core/*.h",
"caffe2/core/nomnigraph/include/nomnigraph/Converters/*.h",
"caffe2/core/nomnigraph/include/nomnigraph/Generated/*.h",
@ -1338,10 +1292,9 @@ cc_library(
) + if_cuda(glob([
"caffe2/**/*.cuh",
"caffe2/image/*.h",
])) + [":generated_caffe2_aten_op_headers"],
])),
copts = CAFFE2_COPTS,
includes = [
"caffe2/contrib/aten",
"caffe2/core/nomnigraph/include",
],
visibility = ["//visibility:public"],
@ -1385,7 +1338,6 @@ cc_library(
"caffe2/db/create_db_op.cc",
"caffe2/db/protodb.cc",
"caffe2/share/contrib/depthwise/depthwise3x3_conv_op.cc",
":caffe2_contrib_srcs",
":caffe2_core_srcs",
":caffe2_distributed_srcs",
":caffe2_ideep_srcs",
@ -1419,7 +1371,6 @@ cc_library(
"@fbgemm//:fbgemm_src_headers",
"@fmt",
"@foxi",
"@gloo",
"@onnx",
] + if_cuda(
[
@ -1467,7 +1418,6 @@ cu_library(
"@cuda//:curand",
"@cudnn",
"@eigen",
"@gloo",
"@tensorpipe//:tensorpipe_cuda",
],
alwayslink = True,

View File

@ -59,23 +59,7 @@ if(INTERN_BUILD_ATEN_OPS)
# Generate the headers wrapped by our operator
file(GLOB_RECURSE torchgen_python "${PROJECT_SOURCE_DIR}/torchgen/*.py")
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/contrib/aten/aten_op.h
COMMAND
"${PYTHON_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/contrib/aten/gen_op.py
--aten_root=${CMAKE_CURRENT_SOURCE_DIR}/../aten
--template_dir=${CMAKE_CURRENT_SOURCE_DIR}/contrib/aten
--yaml_dir=${CMAKE_BINARY_DIR}/aten/src/ATen
--install_dir=${CMAKE_CURRENT_BINARY_DIR}/contrib/aten
DEPENDS
${torchgen_python}
${CMAKE_BINARY_DIR}/aten/src/ATen/Declarations.yaml
${CMAKE_CURRENT_SOURCE_DIR}/contrib/aten/gen_op.py
${CMAKE_CURRENT_SOURCE_DIR}/contrib/aten/aten_op_template.h)
add_custom_target(__aten_op_header_gen
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/contrib/aten/aten_op.h)
add_library(aten_op_header_gen INTERFACE)
add_dependencies(aten_op_header_gen __aten_op_header_gen)
# Add source, includes, and libs to lists
list(APPEND Caffe2_CPU_SRCS ${ATen_CPU_SRCS})
@ -132,7 +116,6 @@ endif()
# Skip modules that are not used by libtorch mobile yet.
if(BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE)
add_subdirectory(contrib)
add_subdirectory(predictor)
add_subdirectory(predictor/emulator)
add_subdirectory(core/nomnigraph)
@ -141,7 +124,6 @@ if(BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE)
endif()
add_subdirectory(db)
add_subdirectory(distributed)
# add_subdirectory(experiments) # note, we may remove this folder at some point
add_subdirectory(ideep)
add_subdirectory(image)
add_subdirectory(video)

View File

@ -1,37 +0,0 @@
add_subdirectory(aten)
add_subdirectory(nccl)
add_subdirectory(opencl)
add_subdirectory(prof)
add_subdirectory(shm_mutex)
add_subdirectory(fakelowp)
if(USE_TENSORRT)
add_subdirectory(tensorrt)
endif()
# Only build Gloo Caffe2 ops on Linux, as it hardcodes
# the Linux-specific `gloo::transport::tcp` namespace.
if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
add_subdirectory(gloo)
endif()
# Pass the src lists back to the parent
# CPU source, include, deps, test sources, binary sources
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
set(Caffe2_CPU_INCLUDE ${Caffe2_CPU_INCLUDE} PARENT_SCOPE)
set(Caffe2_DEPENDENCY_LIBS ${Caffe2_DEPENDENCY_LIBS} PARENT_SCOPE)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_CPU_BINARY_SRCS ${Caffe2_CPU_BINARY_SRCS} PARENT_SCOPE)
# GPU source, include, deps, test sources, binary sources
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_INCLUDE ${Caffe2_GPU_INCLUDE} PARENT_SCOPE)
set(Caffe2_CUDA_DEPENDENCY_LIBS ${Caffe2_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE)
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_BINARY_SRCS ${Caffe2_GPU_BINARY_SRCS} PARENT_SCOPE)
# HIP sources, include, test sources
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE)
set(Caffe2_HIP_INCLUDE ${Caffe2_HIP_INCLUDE} PARENT_SCOPE)
set(Caffe2_HIP_DEPENDENCY_LIBS ${Caffe2_HIP_DEPENDENCY_LIBS} PARENT_SCOPE)
set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} PARENT_SCOPE)

View File

@ -1,12 +0,0 @@
if(NOT INTERN_BUILD_MOBILE AND BUILD_CAFFE2_OPS)
# Add source generated by Codegen.cmake and pass to parent
list(APPEND Caffe2_CPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/aten_op.cc)
list(APPEND Caffe2_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/aten_op_gpu.cc)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
if(USE_ROCM)
list(APPEND Caffe2_HIP_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/hip/aten_op_gpu.cc)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE)
endif()
endif()

View File

@ -1,80 +0,0 @@
# An ATen operator for Caffe2
ATen is a simple tensor library that exposes the Tensor operations in Torch
and PyTorch directly in C++17. This library provides a generated wrapper around the ATen API
that makes these functions available in Caffe2 as an operator. It also makes them accessible through
ToffeeIR.
### Example Usage in Caffe2
First identify a function in ATen you want to call in Functions.h,
Tensor.h, or Type.h.
We will call the `pow` operator:
```
static inline Tensor pow(const Tensor & self, Scalar exponent);
```
Now create a Caffe2 operator to call this op. The name of the operator is always `"ATen"`,
and there is always a string attribute `operator` that defines which ATen function to call:
```
import numpy as np
from caffe2.python import core, workspace
# create the Caffe2 Op:
op = core.CreateOperator(
"ATen",
["MyInput"],
["MyOutput"],
operator="pow", exponent=2.0)
```
Each `Tensor` input becomes a Caffe2 input blob, and each output becomes a Caffe2 output blob.
Non-tensor inputs such as `Scalar exponent` become Caffe2 `arg` attributes.
In the case of `Scalar`, the attribute can be either an integer or a floating-point number.
The op can now be run like any other Caffe2 operator:
```
workspace.FeedBlob("MyInput",np.random.randn(2,3).astype(np.float32))
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("MyOutput"))
```
For methods, the first input is always the `this` Tensor in C++.
To call methods of ATen's `Type` objects, you provide an additional string attribute
that determines the type:
```
# create a 2x4 tensor filled with floating point ones
op = core.CreateOperator(
"ATen",
[],
["MyOutput"],
operator="ones", type="Float", size={2,4})
```
Generally ATen operators are polymorphic across input types, and work on both the CPU and CUDA.
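For example, the same `pow` op can be placed on a GPU by creating it inside a CUDA device scope. This is a minimal sketch, assuming a CUDA-enabled Caffe2 build; the device index 0 is illustrative:
```
from caffe2.python import core
from caffe2.proto import caffe2_pb2

# create the same ATen op, but bound to GPU 0 instead of the CPU
with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
    gpu_op = core.CreateOperator(
        "ATen",
        ["MyInput"],
        ["MyOutput"],
        operator="pow", exponent=2.0)
```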
### Example Usage via PyTorch Symbolic
The ATen operator can also be used to define `symbolic` definitions for PyTorch when an operator is being exported
to ONNX. In this case, the definition of the operator looks the same but is defined using PyTorch's ONNX API:
```
class Add(torch.autograd.Function):
@staticmethod
def symbolic(g, a, b):
return g.at("add", a, b)
@staticmethod
def forward(ctx, a, b):
return a + b
```
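To see the resulting graph, the function can be exported through `torch.onnx.export`. This is a minimal sketch; the `AddModule` wrapper and the output file name are illustrative:
```
import torch
from torch import nn

class AddModule(nn.Module):
    def forward(self, a, b):
        # uses the Add function defined above, including its symbolic method
        return Add.apply(a, b)

# the exported graph contains an ATen[operator="add"] node
torch.onnx.export(AddModule(), (torch.ones(3, 4), torch.ones(3, 4)),
                  "add.onnx", verbose=True)
```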

View File

@ -1,56 +0,0 @@
#include "caffe2/contrib/aten/aten_op.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
namespace internal {
at::Tensor index_with_uint8_handling(
const at::Tensor& self,
const torch::List<std::optional<at::Tensor>>& indices) {
// Support BC only for the simplest case of mask indexing
if (indices.size() == 1) {
std::optional<at::Tensor> first = indices[0];
if (first.has_value()
&& first->scalar_type() == at::kByte) {
TORCH_WARN(
"Indexing with uint8 mask tensor in ATenOp is now deprecated,"
" please use a bool mask instead.");
return at::index(self, {first->to(at::kBool)});
}
}
return at::index(self, indices);
}
} // namespace internal
REGISTER_CPU_OPERATOR(ATen, ATenOp<CPUContext>);
template <>
at::Backend ATenOp<CPUContext>::backend() const {
return at::Backend::CPU;
}
OPERATOR_SCHEMA(ATen);
namespace math {
template <>
void Set<at::Half, CPUContext>(
const std::int64_t /* N */,
const at::Half h,
at::Half* v,
CPUContext* c) {
Set(0, h.x, (uint16_t*)v, c);
}
template <>
void Set<at::BFloat16, CPUContext>(
const std::int64_t /* N */,
const at::BFloat16 b,
at::BFloat16* v,
CPUContext* c) {
Set(0, b.x, (uint16_t*)v, c);
}
} // namespace math
} // namespace caffe2

View File

@ -1 +0,0 @@
#include "caffe2/caffe2/contrib/aten/gen_aten_op.h"

View File

@ -1,12 +0,0 @@
#include "caffe2/contrib/aten/aten_op.h"
#include "caffe2/core/context_gpu.h"
namespace caffe2 {
REGISTER_CUDA_OPERATOR(ATen, ATenOp<CUDAContext>);
template<>
at::Backend ATenOp<CUDAContext>::backend() const {
return at::Backend::CUDA;
}
}

View File

@ -1,237 +0,0 @@
#pragma once
#include <unordered_map>
#include <string>
#include <ATen/Functions.h>
#include <c10/macros/Macros.h>
#include <c10/util/irange.h>
#include <caffe2/core/context.h>
#include <caffe2/core/operator.h>
#include <caffe2/utils/math.h>
#include <iostream>
// a map from descriptor strings (see [DESCRIPTORS])
// to the key in the switch statement that implements them
static std::unordered_map<std::string, int> op_to_key = {
${mappings}
};
namespace caffe2 {
using at::Half; // for AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, ...)
namespace internal {
TORCH_API at::Tensor index_with_uint8_handling(
const at::Tensor& self,
const torch::List<std::optional<at::Tensor>>& indices);
}
template <class Context>
class ATenOp : public Operator<Context> {
public:
ATenOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws) {
VLOG(2) << "ATen OpDef: " << ProtoDebugString(operator_def) << "\n";
switch(findImplementation(operator_def)) {
${cases}
default:
CAFFE_THROW("Unexpected key value for aten operator");
}
}
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice() override {
return run_op();
}
private:
// actual operator implementation is initialized in ctor.
std::function<bool()> run_op;
at::Backend backend() const;
TypeMeta typeMetaFor(const at::Tensor & t) {
return typeMetaFor(t.scalar_type());
}
TypeMeta typeMetaFor(at::ScalarType st) {
#define DEFINE_CASE(ctype,aten_name) \
case at::k##aten_name: \
return TypeMeta::Make<ctype>();
switch(st) {
AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, DEFINE_CASE)
default:
CAFFE_THROW("Unknown ATen Type");
}
#undef DEFINE_CASE
}
at::TensorOptions optionsFor(const Tensor& ten) {
at::Device device = ten.GetDevice();
#if defined(USE_ROCM)
if (backend() == at::Backend::HIP) {
device = at::Device(kCUDA, device.index());
}
#endif
return at::TensorOptions(device).dtype(ten.dtype());
}
at::Tensor tensorWrapping(const Tensor& ten_) {
auto& ten = const_cast<Tensor&>(ten_);
return at::from_blob(
ten.raw_mutable_data(),
ten.sizes(),
optionsFor(ten));
}
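// The operator's inputs form a flat list; peek(i, N) treats the last N Caffe2
// inputs as a window and wraps the i-th tensor of that window as an ATen tensor.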
at::Tensor peek(size_t i, size_t N) {
auto real_idx = InputSize() - N + i;
return tensorWrapping(Input(real_idx));
}
std::vector<at::Tensor> peekSlice(size_t i, size_t len, size_t N) {
std::vector<at::Tensor> results;
results.reserve(len);
for (size_t ii = i; ii < i + len; ++ii) {
results.push_back(peek(ii, N));
}
return results;
}
torch::List<std::optional<at::Tensor>> peekSliceOptionals(size_t i, size_t len, size_t N) {
torch::List<std::optional<at::Tensor>> results;
results.reserve(len);
for (size_t ii = i; ii < i + len; ++ii) {
results.push_back(peek(ii, N));
}
return results;
}
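// Hand the ATen result to the Caffe2 output without copying: the output blob
// shares the ATen storage, and the deleter decrefs the TensorImpl once Caffe2
// releases the blob.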
void assignTo(Tensor* dst, const at::Tensor& src_) {
at::Tensor src = src_.contiguous();
auto at_sizes = src.sizes();
caffe2::TypeMeta type_meta = typeMetaFor(src);
at::Device device = src.device();
#if defined(USE_ROCM)
if (device.is_cuda()) {
device = at::Device(at::DeviceType::HIP, device.index());
}
#endif
at::TensorImpl* src_impl = src.unsafeReleaseTensorImpl();
std::vector<int64_t> dims(at_sizes.begin(), at_sizes.end());
dst->Resize(dims);
dst->ShareExternalPointer(
at::DataPtr(
src_impl->mutable_data(),
static_cast<void*>(src_impl),
[](void* t_ptr) -> void {
at::TensorImpl* local_impl = static_cast<at::TensorImpl*>(t_ptr);
c10::raw::intrusive_ptr::decref(local_impl);
},
device),
type_meta,
0);
}
void assignListStartingAt(
size_t offset,
const std::vector<at::Tensor>& tensors) {
for (const auto i : c10::irange(tensors.size())) {
assignTo(Output(offset + i), tensors[i]);
}
}
template<typename T,
typename std::enable_if<std::numeric_limits<T>::is_integer, bool>::type* =
nullptr>
int64_t extract(const at::Scalar &s) {
return s.toLong();
}
template<typename T,
typename std::enable_if<!std::numeric_limits<T>::is_integer, bool>::type* =
nullptr>
int64_t extract(const at::Scalar &s) {
return s.toDouble();
}
void assignTo(Tensor* dst, at::ScalarType scalar_type, const at::Scalar& scalar) {
switch(scalar_type) {
#define DEFINE_CASE(ctype,aten_name) \
case at::k##aten_name: { \
auto value = extract<ctype>(scalar); \
assignToValue<ctype>(dst, at::convert<ctype,decltype(value)>(value)); \
} break;
AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, DEFINE_CASE)
#undef DEFINE_CASE
default:
CAFFE_THROW("Unknown ATen Type");
}
}
template <typename T>
void assignToValue(Tensor* dst, T v) {
dst->Resize(std::vector<int64_t>());
math::Set(1, v, dst->template mutable_data<T>(), &context_);
}
int findImplementation(const OperatorDef& operator_def) {
CAFFE_ENFORCE(HasArgument("operator"));
std::string op = OperatorBase::GetSingleArgument<std::string>("operator", "");
// construct descriptor string ([DESCRIPTORS]) given the attributes
// and inputs of this operator_def, and look up the implementation key
// for this variant
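// (illustrative: `pow` called with one tensor input and an `exponent`
// attribute yields the descriptor "pow-exponent-1")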
std::stringstream descriptor;
descriptor << op;
std::vector<std::string> attrs;
for (const auto i : c10::irange(operator_def.arg_size())) {
auto & attr = operator_def.arg(i);
if (attr.name() == "operator" || attr.name() == "type" || attr.name() == "overload_name") {
continue;
}
attrs.push_back(attr.name());
}
std::sort(attrs.begin(), attrs.end());
for(auto & a : attrs)
descriptor << "-" << a;
std::string descriptor_sized =
descriptor.str() + "-" + c10::to_string(InputSize());
std::string descriptor_var_args = descriptor.str() + "-*";
if (op_to_key.count(descriptor_sized) > 0) {
return op_to_key[descriptor_sized];
}
if (op_to_key.count(descriptor_var_args) > 0) {
return op_to_key[descriptor_var_args];
}
std::stringstream ss;
ss << "Attempting to run unknown ATen operator configuration: "
<< descriptor_sized;
CAFFE_THROW(ss.str());
}
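// A scalar attribute arrives from Caffe2 as either an int64 or a float
// argument; the integer form is checked first.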
at::Scalar readScalarAttribute(const std::string & name) {
if(OperatorBase::HasSingleArgumentOfType<int64_t>(name)) {
return OperatorBase::GetSingleArgument<int64_t>(name, 0);
} else {
CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType<float>(name));
return OperatorBase::GetSingleArgument<float>(name, 0);
}
}
template<typename T>
T readAttribute(const std::string & name) {
CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType<T>(name));
return OperatorBase::GetSingleArgument<T>(name, 0);
}
std::vector<int64_t> readIntArrayRef(const std::string & name) {
CAFFE_ENFORCE(OperatorBase::HasArgument(name));
return OperatorBase::GetRepeatedArgument<int64_t>(name, {});
}
template <int N>
std::array<bool, N> readBoolMask(const std::string& name) {
CAFFE_ENFORCE(OperatorBase::HasArgument(name));
std::vector<int64_t> ints =
OperatorBase::GetRepeatedArgument<int64_t>(name, {});
std::array<bool, N> result;
for (const auto i : c10::irange(N)) {
result[i] = ints.at(i);
}
return result;
}
${implementations}
};
}

View File

@ -1,131 +0,0 @@
from caffe2.python import core
from hypothesis import given
import caffe2.python.hypothesis_test_util as hu
import hypothesis.strategies as st
import numpy as np
class TestATen(hu.HypothesisTestCase):
@given(inputs=hu.tensors(n=2), **hu.gcs)
def test_add(self, inputs, gc, dc):
op = core.CreateOperator(
"ATen",
["X", "Y"],
["Z"],
operator="add")
def ref(X, Y):
return [X + Y]
self.assertReferenceChecks(gc, op, inputs, ref)
@given(inputs=hu.tensors(n=2, dtype=np.float16), **hu.gcs_gpu_only)
def test_add_half(self, inputs, gc, dc):
op = core.CreateOperator(
"ATen",
["X", "Y"],
["Z"],
operator="add")
def ref(X, Y):
return [X + Y]
self.assertReferenceChecks(gc, op, inputs, ref)
@given(inputs=hu.tensors(n=1), **hu.gcs)
def test_pow(self, inputs, gc, dc):
op = core.CreateOperator(
"ATen",
["S"],
["Z"],
operator="pow", exponent=2.0)
def ref(X):
return [np.square(X)]
self.assertReferenceChecks(gc, op, inputs, ref)
@given(x=st.integers(min_value=2, max_value=8), **hu.gcs)
def test_sort(self, x, gc, dc):
inputs = [np.random.permutation(x)]
op = core.CreateOperator(
"ATen",
["S"],
["Z", "I"],
operator="sort")
def ref(X):
return [np.sort(X), np.argsort(X)]
self.assertReferenceChecks(gc, op, inputs, ref)
@given(inputs=hu.tensors(n=1), **hu.gcs)
def test_sum(self, inputs, gc, dc):
op = core.CreateOperator(
"ATen",
["S"],
["Z"],
operator="sum")
def ref(X):
return [np.sum(X)]
self.assertReferenceChecks(gc, op, inputs, ref)
@given(**hu.gcs)
def test_index_uint8(self, gc, dc):
# Indexing with uint8 is deprecated, but we need to provide backward compatibility for some old models exported through ONNX
op = core.CreateOperator(
"ATen",
['self', 'mask'],
["Z"],
operator="index")
def ref(self, mask):
return (self[mask.astype(np.bool_)],)
tensor = np.random.randn(2, 3, 4).astype(np.float32)
mask = np.array([[1, 0, 0], [1, 1, 0]]).astype(np.uint8)
self.assertReferenceChecks(gc, op, [tensor, mask], ref)
@given(**hu.gcs)
def test_index_put(self, gc, dc):
op = core.CreateOperator(
"ATen",
['self', 'indices', 'values'],
["Z"],
operator="index_put")
def ref(self, indices, values):
self[indices] = values
return (self,)
tensor = np.random.randn(3, 3).astype(np.float32)
mask = np.array([[True, True, True], [True, False, False], [True, True, False]])
values = np.random.randn(6).astype(np.float32)
self.assertReferenceChecks(gc, op, [tensor, mask, values], ref)
@given(**hu.gcs)
def test_unique(self, gc, dc):
op = core.CreateOperator(
"ATen",
['self'],
["output"],
sorted=True,
return_inverse=True,
# return_counts=False,
operator="_unique")
def ref(self):
index, _ = np.unique(self, return_index=False, return_inverse=True, return_counts=False)
return (index,)
tensor = np.array([1, 2, 6, 4, 2, 3, 2])
print(ref(tensor))
self.assertReferenceChecks(gc, op, [tensor], ref)
if __name__ == "__main__":
import unittest
unittest.main()

View File

@ -1,157 +0,0 @@
# Using ONNX and ATen to export models from PyTorch to Caffe2
When using ONNX to export a model from PyTorch into Caffe2, you sometimes end up
hitting operators that are not yet part of the ONNX specification. These may be
operators that haven't been standardized yet, or custom `torch.autograd.Function` types that
are specific to a network.
To bridge this gap, we provide an experimental operator in ONNX that allows you to directly access PyTorch's tensor functions using the ATen library.
[ATen](https://github.com/pytorch/pytorch/tree/main/aten) is the underlying C++ library that PyTorch uses to do tensor operations. Caffe2 has an [ATen operator](https://github.com/pytorch/pytorch/tree/main/caffe2/contrib/aten)
that can run these tensor functions in a Caffe2 network after importing them through ONNX.
This guide explains how to configure Caffe2 and modify your PyTorch program to use
this functionality.
### Enable ATen in Caffe2
The ATen facility in Caffe2 is part of a contrib package and needs to be enabled
when you configure Caffe2 using cmake:
```
git clone https://github.com/caffe2/caffe2/
mkdir caffe2/build
cd caffe2/build
cmake -DUSE_ATEN=ON <other build options> ..
make install
```
### Describe How to Export a PyTorch Autograd Function using ATen
To export a model to ONNX, PyTorch first creates a trace of all the `torch.autograd.Function`s run
in the forward pass of a network. For each function in the trace, it calls that function's
`symbolic` method which describes how to construct the part of the ONNX graph
that will compute this function (see [basic_ops.py](https://github.com/pytorch/pytorch/blob/main/torch/autograd/_functions/basic_ops.py#L59) for examples).
When equivalent ONNX operators do not exist, you can instead call any ATen function.
As an example let's assume we have an autograd function which computes `x*x+y`:
```
class MyFunction(Function):
@staticmethod
def forward(ctx, x, y):
return x*x + y
```
We can add a `symbolic` method to it like so:
```
class MyFunction(Function):
@staticmethod
def forward(ctx, x, y):
return x*x + y
@staticmethod
def symbolic(graph, x, y):
x2 = graph.at("mul", x, x)
r = graph.at("add", x2, y)
# x, y, x2, and r are 'Node' objects
# print(r) or print(graph) will print out a textual representation for debugging.
# this representation will be converted to ONNX protobufs on export.
return r
```
The function `graph.at` adds a new ATen op to the computation graph.
You can call any ATen function using this facility. To do so,
first identify a function in ATen you want to call in Functions.h,
Tensor.h, or Type.h.
As an example, we might want to call the `pow` operator:
```
static inline Tensor pow(const Tensor & self, Scalar exponent);
```
We can translate this into the equivalent `graph.at` function:
```
def symbolic(graph, x):
graph.at("pow", x, exponent_f = 2.0) # compute x**2
```
Tensor arguments to ATen functions become arguments to `graph.at`, while a `Scalar`
like `exponent` becomes a keyword argument that specifies an ONNX attribute.
Attributes are suffixed with their type (`_f` for floats, `_i` for integers, and `_s` for strings).
For methods, the first input is always the `this` Tensor in C++.
To call methods of ATen's `Type` objects, you provide an additional string attribute
that determines the type. For instance, `ones` creates a new constant tensor of all ones:
```
class Type {
...
virtual Tensor ones(IntArrayRef size) const;
...
};
```
From PyTorch it can be created by adding the type as an additional attribute:
```
def symbolic(graph, x):
return graph.at("ones", type_s="float", size_i=[2,4])
```
Generally ATen operators are polymorphic across input types, and work on both the CPU and CUDA.
## Putting it together
With these building blocks we can now write and export networks that include custom operators using `torch.onnx.export`:
```
class MyModule(nn.Module):
def forward(self, x, y):
# you can combine your ATen ops with standard onnx ones
x = nn.ReLU()(x)
return MyFunction.apply(x, y)
torch.onnx.export(MyModule(),
(Variable(torch.ones(3,4)), Variable(torch.ones(3,4))),
"output.onnx",
verbose=True)
```
This exports the following graph, which contains calls to the `ATen` operator:
```
graph(%1 : Float(3, 4)
%2 : Float(3, 4)) {
%3 : Float(3, 4) = Relu(%1), uses = [%4.i0, %4.i1];
%4 : UNKNOWN_TYPE = ATen[operator=mul](%3, %3), uses = [%5.i0];
%5 : Float(3, 4) = ATen[operator=add](%4, %2), uses = [%0.i0];
return (%5);
}
```
The graph can then be imported using ONNX and run with Caffe2:
```
import onnx
import caffe2.python.onnx.backend
import numpy as np
graph = onnx.load("output.onnx")
a = np.random.randn(3, 4).astype(np.float32)
b = np.random.randn(3, 4).astype(np.float32)
prepared_backend = caffe2.python.onnx.backend.prepare(graph)
W = {graph.graph.input[0].name: a, graph.graph.input[1].name: b}
c2_out = prepared_backend.run(W)[0]
x = np.maximum(a, 0)
r = x*x + b
np.testing.assert_array_almost_equal(r, c2_out)
```
### Code
For the full source code for this tutorial, see [sample.py](sample.py).

View File

@ -1,56 +0,0 @@
import tempfile
import numpy as np
from torch import nn
from torch.autograd import Variable, Function
import torch.onnx
import onnx
import caffe2.python.onnx.backend
class MyFunction(Function):
@staticmethod
def forward(ctx, x, y):
return x * x + y
@staticmethod
def symbolic(graph, x, y):
x2 = graph.at("mul", x, x)
r = graph.at("add", x2, y)
# x, y, x2, and r are 'Node' objects
# print(r) or print(graph) will print out a textual representation for debugging.
# this representation will be converted to ONNX protobufs on export.
return r
class MyModule(nn.Module):
def forward(self, x, y):
# you can combine your ATen ops with standard onnx ones
x = nn.ReLU()(x)
return MyFunction.apply(x, y)
f = tempfile.NamedTemporaryFile()
torch.onnx.export(MyModule(),
(Variable(torch.ones(3, 4)), Variable(torch.ones(3, 4))),
f, verbose=True)
# prints the graph for debugging:
# graph(%input : Float(3, 4, strides=[4, 1], requires_grad=0, device=cpu),
# %y : Float(3, 4, strides=[4, 1], requires_grad=0, device=cpu)):
# %2 : Float(3, 4, strides=[4, 1], requires_grad=0, device=cpu) = onnx::Relu(%input)
# %3 : Tensor = aten::ATen[operator="mul"](%2, %2)
# %4 : Float(3, 4, strides=[4, 1], requires_grad=0, device=cpu) = aten::ATen[operator="add"](%3, %y)
# return (%4)
graph = onnx.load(f.name)
a = np.random.randn(3, 4).astype(np.float32)
b = np.random.randn(3, 4).astype(np.float32)
prepared_backend = caffe2.python.onnx.backend.prepare(graph)
W = {graph.graph.input[0].name: a, graph.graph.input[1].name: b}
c2_out = prepared_backend.run(W)[0]
x = np.maximum(a, 0)
r = x * x + b
np.testing.assert_array_almost_equal(r, c2_out)

View File

@ -1,330 +0,0 @@
#!/usr/bin/env python3
# Copyright (c) 2016-present, Facebook, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
import sys
import yaml
import argparse
import os
from copy import deepcopy
from typing import Dict, List, Set
parser = argparse.ArgumentParser()
parser.add_argument("--template_dir", default=".", help="where template.h is")
parser.add_argument("--yaml_dir", default="aten/src/ATen/ATen",
help="where ATen yaml files are")
parser.add_argument("--output_prefix", default="", help="")
parser.add_argument(
"--install_dir", default=".", help="where to put generated file")
parser.add_argument("--aten_root", default="", help="root directory of aten")
args, _ = parser.parse_known_args()
if args.aten_root:
if not os.path.exists(args.aten_root):
raise ValueError('aten_root ({}) does not exist'.format(
args.aten_root))
sys.path.insert(0, os.path.join(args.aten_root, '..'))
from torchgen.code_template import CodeTemplate as CT
else:
from torchgen.code_template import CodeTemplate as CT
OP_TEMPLATE = CT.from_file(
os.path.join(args.template_dir, 'aten_op_template.h'))
try:
# use faster C loader if available
from yaml import CSafeLoader as Loader
except ImportError:
from yaml import SafeLoader as Loader # type: ignore[assignment, misc]
def write(filename, s):
with open(filename, "w") as f:
f.write(s)
def read(filename):
with open(filename, "r") as f:
return f.read()
def value_has_tensors(v):
# Sparse shouldn't appear in the public API; this seems to be a temporary bug
return "Tensor" in v['dynamic_type'] and "Sparse" not in v['dynamic_type']
def value_is_tensor_type(v):
return value_has_tensors(v) and v['dynamic_type'] not in TENSORLIST_TYPE
TENSORLIST_TYPE = [
'at::TensorList',
'const at::ITensorListRef &',
'const c10::List<::std::optional<at::Tensor>> &',
]
# for each aten type, how do we handle a return value of that type?
RETURN_MAP = {
'at::Tensor': 'assignTo(Output(${offset}),${output});',
'at::Scalar': 'assignTo(Output(${offset}),${output}.type(), ${output});',
'bool': 'assignToValue<int64_t>(Output(${offset}),${output});',
'int64_t': 'assignToValue<int64_t>(Output(${offset}),${output});',
'::std::vector<at::Tensor>': 'assignListStartingAt(${offset}, ${output});',
}
# for each non-Tensor aten argument, how do we read it from caffe2's
# attribute list. Most of these call runtime functions defined in the
# template class.
ARGUMENT_MAP = {
'const at::Scalar &': 'at::Scalar ${arg} = readScalarAttribute("${arg}");',
'bool': 'bool ${arg} = readAttribute<int64_t>("${arg}");',
'int': 'int ${arg} = readAttribute<int64_t>("${arg}");',
'double': 'double ${arg} = readAttribute<float>("${arg}");',
'int64_t': 'int64_t ${arg} = readAttribute<int64_t>("${arg}");',
'at::IntArrayRef': 'auto ${arg} = readIntArrayRef("${arg}");',
'::std::array<bool,2>': 'auto ${arg} = readBoolMask<2>("${arg}");',
'::std::array<bool,3>': 'auto ${arg} = readBoolMask<3>("${arg}");',
}
# for BC reasons we want to route some of the functions to different
# implementations
SPECIAL_IMPLEMENTATIONS = {
'index': 'internal::index_with_uint8_handling',
}
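# Generate one variant of the declaration per trailing defaulted argument, so
# calls that omit some optional attributes still match a descriptor.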
def expand(o):
num_defaults = sum(1 if 'default' in arg else 0 for arg in o['arguments'])
results = [o]
for i in range(0, num_defaults):
# last num_default values should be default
assert('default' in o['arguments'][-(i + 1)])
v = deepcopy(o)
v['arguments'] = v['arguments'][:-(i + 1)]
results.append(v)
return results
# filter the list of declarations removing things we cannot support
def supports(o, factory_methods):
# Ignore all families (!) of functions that have TensorOptions (i.e. tensor factory methods).
if o['name'] in factory_methods:
if factory_methods[o['name']] == 0:
print("Skipping {} because it is a factory method".format(o['name']))
factory_methods[o['name']] += 1
return False
# skip all in-place operators for now since aten cannot Resize
# caffe2 memory inside an operator
if o['inplace']:
return False
# _out variants also work in-place on arguments taken as destinations
# we also cannot handle these because aten cannot resize caffe2 Tensors
if "_out" in o['name']:
return False
# skip if there is no return value (previously 'void')
if len(o['returns']) == 0:
return False
# skip return types we cannot handle
for ret in o['returns']:
if not value_has_tensors(ret) and ret['type'] not in RETURN_MAP:
print("Skipping {} Because of Ret: {} ({})".format(
o['name'], ret['type'], ret['dynamic_type']))
return False
# skip arguments we cannot handle
for arg in o['arguments']:
if not value_has_tensors(arg) and arg['type'] not in ARGUMENT_MAP:
print("Skipping {} Because of Arg: {} ({}) ".format(
o['name'], arg['type'], arg['dynamic_type']))
return False
return True
# template for each potential operator.
# each operator has an integer 'key' associated with it, and
# a lambda that defines the operator
# non-tensor attributes are created in ${initialization}
# and then saved as arguments to the lambda
# Inputs/Outputs are read inside the lambda
#
# each implementation is defined in a separate method annotated with
# C10_NOINLINE to avoid inlining into the ATenOp constructor, which would
# trigger pathological compile times.
IMPLEMENTATION_TEMPLATE = CT("""\
C10_NOINLINE void implementation_${key}() { // ${name}
${initialization}
run_op = [=] {
at::AutoDispatchBelowAutograd guard;
${statements}
auto the_result = ${invocation};
${assignments}
return true;
};
}
""")
CASE_TEMPLATE = CT("""\
case ${key}: // ${name}
implementation_${key}();
break;
""")
ASSIGN_CHECK_SIZE_TEMPLATE = CT("""\
if(OutputSize() > ${offset}) {${assignment}}
""")
def get_output(o, i):
if len(o['returns']) == 1:
return 'the_result'
else:
return '::std::get<{}>(the_result)'.format(i)
def attribute_names(o):
return sorted([a['name'] for a in o['arguments'] if not value_has_tensors(a)])
def required_attribute_names(o):
return sorted([a['name'] for a in o['arguments'] if not value_has_tensors(a) and 'default' not in a])
def self_as_first_argument(arguments):
return ([a for a in arguments if a['name'] == 'self'] +
[a for a in arguments if a['name'] != 'self'])
def get_num_inputs(o):
args = 0
for a in o['arguments']:
if a['type'] in TENSORLIST_TYPE:
return '*'
elif value_has_tensors(a):
args += 1
return str(args)
def find_factory_methods(decls):
factory_methods = {}
for o in decls:
if any(arg['dynamic_type'] == 'at::TensorOptions' for arg in o['arguments']):
factory_methods[o['name']] = 0
return factory_methods
def emit_assignments(o, env):
for i, r in enumerate(o['returns']):
t = RETURN_MAP[r['type'] if not value_is_tensor_type(r) else 'at::Tensor']
assignment = CT(t).substitute(env, offset=i, output=get_output(o, i))
check_size_assignment = ASSIGN_CHECK_SIZE_TEMPLATE.substitute(env, offset=i, assignment=assignment)
env['assignments'].append(check_size_assignment)
if __name__ == '__main__':
decls = yaml.load(read(os.path.join(args.yaml_dir, 'Declarations.yaml')), Loader=Loader)
factory_methods = find_factory_methods(decls)
filtered = [expanded for o in decls for expanded in expand(o) if supports(expanded, factory_methods)]
top_env: Dict[str, List] = {
'mappings': [],
'implementations': [],
'cases': [],
}
seen: Set[str] = set()
key = 0
for o in filtered:
# [DESCRIPTORS]
# each option is associated with a descriptor string that is used
# to figure out which version of an op is being used:
# The format is:
# opname-attribute_1-attribute_2-num_inputs
# Example:
# lerp-weight-2
# the operator lerp has the attribute weight and takes 2 tensor inputs
attr_names = attribute_names(o)
num_inputs = get_num_inputs(o)
descriptor = '-'.join([o['name']] + attr_names + [num_inputs])
if descriptor in seen:
continue
seen.add(descriptor)
# map from descriptor string to the integer key in the switch statements
# that initializes the operators
top_env['mappings'].append('{{ "{}", {} }},'.format(descriptor, key))
env = {
'name': o['name'],
'statements': [],
'arguments': [],
'assignments': [],
'initialization': [],
'key': str(key),
}
if 'namespace' not in o['method_of'] and 'Tensor' not in o['method_of']:
# methods on type like 'ones' or 'zeros' always take a
# string attribute that is translated into the at::Type object
# e.g. "Float" is at::kFloat
assert('Type' in o['method_of'])
static_tensor_inputs = sum(arg['type'] not in TENSORLIST_TYPE and value_is_tensor_type(arg) for arg in o['arguments'])
has_tensorlist = any(arg['type'] in TENSORLIST_TYPE for arg in o['arguments'])
if has_tensorlist:
tensorlist_idx = [i for i, arg in enumerate(o['arguments']) if arg['type'] in TENSORLIST_TYPE][0]
real_inputs = 0
for i, arg in enumerate(o['arguments']):
env['arguments'].append(arg['name'])
# Pretend the flat argument list is a stack where the end is the top.
view_length = 'InputSize()' if has_tensorlist and i < tensorlist_idx else static_tensor_inputs
if arg['type'] == 'at::TensorList' or arg['type'] == 'const at::ITensorListRef &':
# NOTE: do not advance real_inputs here. After this we will
# switch to indexing the "stack" from the end
env['statements'].append(
'auto {} = peekSlice({}, InputSize() - {}, InputSize());'
.format(arg['name'], real_inputs, static_tensor_inputs))
elif arg['type'] == 'const c10::List<::std::optional<at::Tensor>> &':
# NOTE: do not advance real_inputs here. After this we will
# switch to indexing the "stack" from the end
env['statements'].append(
'auto {} = peekSliceOptionals({}, InputSize() - {}, InputSize());'
.format(arg['name'], real_inputs, static_tensor_inputs))
elif value_is_tensor_type(arg):
# load tensor inputs from Caffe2
env['statements'].append(
'auto {} = peek({}, {});'.format(arg['name'], real_inputs, view_length))
real_inputs += 1
else:
init = CT(ARGUMENT_MAP[arg['type']]).substitute(env, arg=arg['name'])
env['initialization'].append(init)
emit_assignments(o, env)
if o['name'] in SPECIAL_IMPLEMENTATIONS:
env['invocation'] = "{}({})".format(SPECIAL_IMPLEMENTATIONS[o['name']], ','.join(env['arguments']))
elif 'namespace' in o['method_of']:
env['invocation'] = CT("at::${name}(${arguments})").substitute(env)
else:
assert('Tensor' in o['method_of'])
env['invocation'] = "self.{}({})".format(
o['name'], ', '.join(env['arguments'][1:]))
top_env['implementations'].append(IMPLEMENTATION_TEMPLATE.substitute(env))
top_env['cases'].append(CASE_TEMPLATE.substitute(env))
key += 1
write(os.path.join(args.install_dir, args.output_prefix + "aten_op.h"), OP_TEMPLATE.substitute(top_env))

View File

@ -1,35 +0,0 @@
if(USE_FAKELOWP)
message(STATUS "Including FakeLowP operators")
# ---[ CPU files.
file(GLOB_RECURSE tmp *.cc)
set(FAKELOWP_CPU_SRCS ${FAKELOWP_CPU_SRCS} ${tmp})
# exclude test files and gpu files
file(GLOB_RECURSE tmp *_test.cc)
exclude(FAKELOWP_CPU_SRCS "${FAKELOWP_CPU_SRCS}" ${tmp})
# We will only build the perf kernel files if the compiler supports avx2
# extensions.
if(CXX_AVX2_FOUND)
add_library(caffe2_fakelowp_ops OBJECT ${FAKELOWP_CPU_SRCS})
add_dependencies(caffe2_fakelowp_ops fbgemm cpuinfo Caffe2_PROTO c10 aten_op_header_gen)
target_include_directories(caffe2_fakelowp_ops BEFORE
PRIVATE $<BUILD_INTERFACE:${FBGEMM_SOURCE_DIR}/include>)
target_include_directories(caffe2_fakelowp_ops BEFORE
PRIVATE $<BUILD_INTERFACE:${CPUINFO_SOURCE_DIR}/include>)
if(MSVC)
set_property(SOURCE ${FAKELOWP_CPU_SRCS}
APPEND_STRING PROPERTY COMPILE_FLAGS " /arch:AVX2 ")
else()
set_property(SOURCE ${FAKELOWP_CPU_SRCS}
APPEND_STRING PROPERTY COMPILE_FLAGS " -mavx2 -mfma -mf16c -mxsave ")
endif()
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS}
$<TARGET_OBJECTS:caffe2_fakelowp_ops>)
endif()
# ---[ Send the lists to the parent scope.
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
else()
message(STATUS "Excluding FakeLowP operators")
endif()

View File

@ -1,66 +0,0 @@
#include "batch_matmul_fp16_fake_op.h"
#include "caffe2/core/operator_schema.h"
namespace caffe2 {
vector<TensorShape> TensorInferenceForBatchMatMul(
const OperatorDef& def,
const vector<TensorShape>& in);
OpSchema::Cost CostInferenceForBatchMatMul(
const OperatorDef& def,
const vector<TensorShape>& in);
REGISTER_CPU_OPERATOR(BatchMatMulFP16Fake, BatchMatMulFP16FakeOp<CPUContext>);
OPERATOR_SCHEMA(BatchMatMulFP16Fake)
.NumInputs(2)
.NumOutputs(1)
.SetDoc(R"DOC(
Batch Matrix multiplication Yi = Ai * Bi, where A has shape (dim0, dim1, ... M, K),
B has shape (dim0, dim1, ... K, N), Y has shape (dim0, dim1, ... M, N) and i ranges
from 0 to (dim0 * dim1 ...) - 1. rank(A) == rank(B) >= 2. In case of A and B being
two dimensional, it behaves like normal matrix multiplication.
)DOC")
.Input(0, "A", "tensor of shape (dim0, dim1 ... M, K)")
.Input(1, "B", "tensor of shpae (dim0, dim2 ... K, N)")
.Output(0, "Y", "tensor of shape (dim0, dim1 ... M, N)")
.Arg(
"trans_a",
"Pass 1 to transpose the last two dimensions of A before "
"doing multiplication")
.Arg(
"trans_b",
"Pass 1 to transpose the last two dimensions of B before "
"doing multiplication")
.Arg(
"broadcast",
"Pass 1 to allow broadcasting of dimensions. Behavior is the same as numpy.matmul. Gradient is currently not supported when running in broadcast mode.")
.TensorInferenceFunction(TensorInferenceForBatchMatMul)
.CostInferenceFunction(
OpSchema::CostInferenceFunctionType(CostInferenceForBatchMatMul))
.InheritOnnxSchema();
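// Shape example for the schema above (illustrative): A of shape (B, M, K) and
// B of shape (B, K, N) give Y of shape (B, M, N); with trans_a=1, A is read as (B, K, M).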
REGISTER_CPU_OPERATOR(
BatchMatMulFP16Acc16Fake,
BatchMatMulFP16FakeOp<
CPUContext,
DefaultEngine,
true /*use custom fp16 gemm acc16*/,
false /*not using temp accumulator*/,
false /*use math fp16 gemm acc 32*/>);
OPERATOR_SCHEMA(BatchMatMulFP16Acc16Fake).NumInputs(2).NumOutputs(1);
REGISTER_CPU_OPERATOR(
BatchMatMulFP16Acc32Fake,
BatchMatMulFP16FakeOp<
CPUContext,
DefaultEngine,
false /*use custom fp16 gemm acc16*/,
false /*not using temp accumulator*/,
true /*use custom fp16 gemm acc32*/>);
OPERATOR_SCHEMA(BatchMatMulFP16Acc32Fake).NumInputs(2).NumOutputs(1);
} // namespace caffe2

View File

@ -1,440 +0,0 @@
#ifndef CAFFE2_OPERATORS_BATCH_MATMUL_OP_H_
#define CAFFE2_OPERATORS_BATCH_MATMUL_OP_H_
#include <ATen/Utils.h>
#include <c10/util/accumulate.h>
#include <fbgemm/FbgemmConvert.h>
#include "caffe2/contrib/fakelowp/fp16_gemm_utils.h"
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include <algorithm>
#include <functional>
#include <numeric>
#include <string>
#include <vector>
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
namespace caffe2 {
template <
class Context,
class Engine = DefaultEngine,
bool USE_ACC_FP16 = false,
bool USE_TMP_ACCUMULATOR = false,
bool USE_CUSTOM_ACC32 =
false> /* if USE_ACC_FP16=false, set to true to use custom gemm kernel
in fp16_gemm_utils.cc instead of math.h gemm functions */
class BatchMatMulFP16FakeOp final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
template <class... Args>
explicit BatchMatMulFP16FakeOp(Args&&... args)
: Operator<Context>(std::forward<Args>(args)...),
OP_SINGLE_ARG(bool, "trans_a", trans_a_, false),
OP_SINGLE_ARG(bool, "trans_b", trans_b_, false),
OP_SINGLE_ARG(bool, "broadcast", broadcast_, false) {}
bool RunOnDevice() override {
return DispatchHelper<TensorTypes<float>>::call(this, Input(0));
}
template <typename T>
bool DoRunWithType() {
const auto& A = Input(0);
const auto& B = Input(1);
const int A_ndim = A.dim();
const int B_ndim = B.dim();
const std::vector<std::int64_t> A_dims = A.sizes().vec();
const std::vector<std::int64_t> B_dims = B.sizes().vec();
const T* A_data = A.template data<T>();
const T* B_data = B.template data<T>();
// Fake fp16 rounding of input
std::vector<float> A_rounded(A.numel());
std::vector<float> B_rounded(B.numel());
fbgemm::RoundToFloat16(
A_data,
A_rounded.data(),
A.numel(),
FLAGS_caffe2_fbgemm_fake_fp16_clamp,
USE_ACC_FP16);
fbgemm::RoundToFloat16(
B_data,
B_rounded.data(),
B.numel(),
FLAGS_caffe2_fbgemm_fake_fp16_clamp,
USE_ACC_FP16);
A_data = A_rounded.data();
B_data = B_rounded.data();
if (A_ndim == 1 && B_ndim == 1) {
CAFFE_ENFORCE_EQ(A.numel(), B.numel());
auto* Y = Output(0, {1}, at::dtype<T>());
T* Y_data = Y->template mutable_data<T>();
math::Dot<T, Context>(A.numel(), A_data, B_data, Y_data, &context_);
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(Y_data),
Y_data,
Y->numel(),
FLAGS_caffe2_fbgemm_fake_fp16_clamp,
USE_ACC_FP16);
return true;
}
if (A_ndim == 1) {
const int N = A.numel();
if (trans_b_) {
CAFFE_ENFORCE_EQ(B_dims[B_ndim - 1], N);
} else {
CAFFE_ENFORCE_EQ(B_dims[B_ndim - 2], N);
}
std::vector<std::int64_t> Y_dims(B_ndim - 1);
if (trans_b_) {
std::copy_n(B_dims.cbegin(), B_ndim - 1, Y_dims.begin());
} else {
std::copy_n(B_dims.cbegin(), B_ndim - 2, Y_dims.begin());
Y_dims.back() = B_dims.back();
}
auto* Y = Output(0, Y_dims, at::dtype<T>());
T* Y_data = Y->template mutable_data<T>();
if (trans_b_) {
const int M = B.numel() / N;
caffe2::custom_fp16_gemv(
USE_ACC_FP16,
USE_CUSTOM_ACC32,
USE_TMP_ACCUMULATOR,
CblasNoTrans,
M,
N,
1.0f,
B_data,
A_data,
0.0f,
Y_data,
&context_);
} else {
const int M = B_dims[B_ndim - 1];
const int batch_size = B.numel() / (M * N);
if (batch_size == 1) {
caffe2::custom_fp16_gemv(
USE_ACC_FP16,
USE_CUSTOM_ACC32,
USE_TMP_ACCUMULATOR,
CblasTrans,
N,
M,
1.0f,
B_data,
A_data,
0.0f,
Y_data,
&context_);
} else {
caffe2::custom_fp16_gemm_strided_batched(
USE_ACC_FP16,
USE_CUSTOM_ACC32,
USE_TMP_ACCUMULATOR,
CblasTrans,
CblasNoTrans,
batch_size,
M,
1,
N,
1.0f,
B_data,
M * N,
A_data,
0,
0.0f,
Y_data,
M,
&context_);
}
}
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(Y_data),
Y_data,
Y->numel(),
FLAGS_caffe2_fbgemm_fake_fp16_clamp,
USE_ACC_FP16);
return true;
}
if (B_ndim == 1) {
const int N = B.numel();
if (trans_a_) {
CAFFE_ENFORCE_EQ(A_dims[A_ndim - 2], N);
} else {
CAFFE_ENFORCE_EQ(A_dims[A_ndim - 1], N);
}
const std::vector<std::int64_t> Y_dims(
A_dims.cbegin(), A_dims.cbegin() + A_ndim - 1);
auto* Y = Output(0, Y_dims, at::dtype<T>());
T* Y_data = Y->template mutable_data<T>();
if (trans_a_) {
const int M = A_dims[A_ndim - 1];
const int batch_size = A.numel() / (M * N);
if (batch_size == 1) {
caffe2::custom_fp16_gemv(
USE_ACC_FP16,
USE_CUSTOM_ACC32,
USE_TMP_ACCUMULATOR,
CblasTrans,
N,
M,
1.0f,
A_data,
B_data,
0.0f,
Y_data,
&context_);
} else {
caffe2::custom_fp16_gemm_strided_batched(
USE_ACC_FP16,
USE_CUSTOM_ACC32,
USE_TMP_ACCUMULATOR,
CblasTrans,
CblasNoTrans,
batch_size,
M,
1,
N,
1.0f,
A_data,
M * N,
B_data,
0,
0.0f,
Y_data,
M,
&context_);
}
} else {
const int M = A.numel() / N;
caffe2::custom_fp16_gemv(
USE_ACC_FP16,
USE_CUSTOM_ACC32,
USE_TMP_ACCUMULATOR,
CblasNoTrans,
M,
N,
1.0f,
A_data,
B_data,
0.0f,
Y_data,
&context_);
}
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(Y_data),
Y_data,
Y->numel(),
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
return true;
}
const int M = trans_a_ ? A_dims[A_ndim - 1] : A_dims[A_ndim - 2];
const int K = trans_a_ ? A_dims[A_ndim - 2] : A_dims[A_ndim - 1];
if (trans_b_) {
CAFFE_ENFORCE_EQ(B_dims[B_ndim - 1], K);
} else {
CAFFE_ENFORCE_EQ(B_dims[B_ndim - 2], K);
}
const int N = trans_b_ ? B_dims[B_ndim - 2] : B_dims[B_ndim - 1];
const int ndim = std::max(A_ndim, B_ndim);
std::vector<std::int64_t> A_broadcast_dims(ndim);
std::vector<std::int64_t> B_broadcast_dims(ndim);
std::vector<std::int64_t> Y_broadcast_dims(ndim);
math::utils::ComputeBroadcastBinaryOpDims(
A_ndim - 2,
A_dims.data(),
B_ndim - 2,
B_dims.data(),
A_broadcast_dims.data(),
B_broadcast_dims.data(),
Y_broadcast_dims.data());
Y_broadcast_dims[ndim - 2] = M;
Y_broadcast_dims[ndim - 1] = N;
auto* Y = Output(0, Y_broadcast_dims, at::dtype<T>());
T* Y_data = Y->template mutable_data<T>();
const int batch_dim = ndim - 2;
const bool is_broadcast_dims = !std::equal(
A_broadcast_dims.cbegin(),
A_broadcast_dims.cbegin() + batch_dim,
B_broadcast_dims.cbegin());
if (is_broadcast_dims) {
CAFFE_ENFORCE(broadcast_);
}
const std::int64_t A_batch_size = c10::multiply_integers(
A_broadcast_dims.cbegin(),
A_broadcast_dims.cbegin() + batch_dim);
const std::int64_t B_batch_size = c10::multiply_integers(
B_broadcast_dims.cbegin(),
B_broadcast_dims.cbegin() + batch_dim);
const std::int64_t Y_batch_size = c10::multiply_integers(
Y_broadcast_dims.cbegin(),
Y_broadcast_dims.cbegin() + batch_dim);
if (Y_batch_size == 0) {
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(Y_data),
Y_data,
Y->numel(),
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
return true;
}
if (A_batch_size == 1 && B_batch_size == 1) {
if (USE_ACC_FP16) {
caffe2::custom_fp16_gemm_with_trans(
trans_a_ ? CblasTrans : CblasNoTrans,
trans_b_ ? CblasTrans : CblasNoTrans,
M,
K,
N,
A_data,
B_data,
0.0f,
Y_data,
true, /* use acc16*/
USE_TMP_ACCUMULATOR);
} else if (USE_CUSTOM_ACC32) {
caffe2::custom_fp16_gemm_with_trans(
trans_a_ ? CblasTrans : CblasNoTrans,
trans_b_ ? CblasTrans : CblasNoTrans,
M,
K,
N,
A_data,
B_data,
0.0f,
Y_data,
false, /* use acc32*/
USE_TMP_ACCUMULATOR);
} else {
math::Gemm<T, Context, Engine>(
trans_a_ ? CblasTrans : CblasNoTrans,
trans_b_ ? CblasTrans : CblasNoTrans,
M,
N,
K,
1.0f,
A_data,
B_data,
0.0f,
Y_data,
&context_);
}
} else if (A_batch_size == 1) {
caffe2::custom_fp16_gemm_strided_batched(
USE_ACC_FP16,
USE_CUSTOM_ACC32,
USE_TMP_ACCUMULATOR,
trans_a_ ? CblasTrans : CblasNoTrans,
trans_b_ ? CblasTrans : CblasNoTrans,
Y_batch_size,
M,
N,
K,
1.0f,
A_data,
0,
B_data,
K * N,
0.0f,
Y_data,
M * N,
&context_);
} else if (B_batch_size == 1) {
caffe2::custom_fp16_gemm_strided_batched(
USE_ACC_FP16,
USE_CUSTOM_ACC32,
USE_TMP_ACCUMULATOR,
trans_a_ ? CblasTrans : CblasNoTrans,
trans_b_ ? CblasTrans : CblasNoTrans,
Y_batch_size,
M,
N,
K,
1.0f,
A_data,
M * K,
B_data,
0,
0.0f,
Y_data,
M * N,
&context_);
} else if (!is_broadcast_dims) {
caffe2::custom_fp16_gemm_strided_batched(
USE_ACC_FP16,
USE_CUSTOM_ACC32,
USE_TMP_ACCUMULATOR,
trans_a_ ? CblasTrans : CblasNoTrans,
trans_b_ ? CblasTrans : CblasNoTrans,
Y_batch_size,
M,
N,
K,
1.0f,
A_data,
M * K,
B_data,
K * N,
0.0f,
Y_data,
M * N,
&context_);
} else {
std::vector<const T*> A_ptr(Y_batch_size);
std::vector<const T*> B_ptr(Y_batch_size);
std::vector<T*> Y_ptr(Y_batch_size);
std::vector<std::int64_t> index(batch_dim);
for (std::int64_t i = 0; i < Y_batch_size; ++i) {
const std::int64_t A_index = math::utils::GetIndexFromDims(
batch_dim, A_broadcast_dims.data(), index.data());
const std::int64_t B_index = math::utils::GetIndexFromDims(
batch_dim, B_broadcast_dims.data(), index.data());
A_ptr[i] = A_data + A_index * M * K;
B_ptr[i] = B_data + B_index * K * N;
Y_ptr[i] = Y_data + i * M * N;
math::utils::IncreaseIndexInDims(
batch_dim, Y_broadcast_dims.data(), index.data());
}
caffe2::custom_fp16_gemm_batched(
USE_ACC_FP16,
USE_CUSTOM_ACC32,
USE_TMP_ACCUMULATOR,
trans_a_ ? CblasTrans : CblasNoTrans,
trans_b_ ? CblasTrans : CblasNoTrans,
Y_batch_size,
M,
N,
K,
1.0f,
A_ptr.data(),
B_ptr.data(),
0.0f,
Y_ptr.data(),
&context_);
}
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(Y_data),
Y_data,
Y->numel(),
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
return true;
}
private:
const bool trans_a_;
const bool trans_b_;
const bool broadcast_;
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_BATCH_MATMUL_OP_H_

View File

@ -1,5 +0,0 @@
#include "caffe2/core/init.h"
C10_DEFINE_bool(caffe2_fbgemm_fake_fp16_clamp, true, "");
C10_DEFINE_bool(caffe2_fbgemm_fake_fp16_clamp_denorms, true, "");

View File

@ -1,5 +0,0 @@
#pragma once
namespace caffe2 {
} // namespace caffe2

View File

@ -1,102 +0,0 @@
#include <fbgemm/FbgemmConvert.h>
#include "caffe2/contrib/fakelowp/sum_fp16_fake_op.h"
#include "caffe2/operators/elementwise_add_op.h"
#include "caffe2/operators/elementwise_div_op.h"
#include "caffe2/operators/elementwise_mul_op.h"
#include "caffe2/operators/elementwise_sub_op.h"
#include "caffe2/operators/utility_ops.h"
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
namespace caffe2 {
namespace {
int getSizeFromDims(const std::vector<int>& dims) {
int tot = 1;
for (auto i = 0; i < dims.size(); i++) {
tot *= dims[i];
}
return tot;
}
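// Wraps a binary elementwise functor; the float/float specialization below
// rounds the inputs and the result to fp16 to emulate low-precision arithmetic.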
template <class Functor>
struct FP16PairWiseCPUFunctor {
template <typename TIn, typename TOut>
bool Forward(
const std::vector<int>& A_dims,
const std::vector<int>& B_dims,
const TIn* A,
const TIn* B,
TOut* C,
CPUContext* context) const {
functor.Forward(A_dims, B_dims, A, B, C, context);
return true;
}
template<>
bool Forward<float, float>(
const std::vector<int>& A_dims,
const std::vector<int>& B_dims,
const float* A,
const float* B,
float* C,
CPUContext* context) const {
auto A_sz = getSizeFromDims(A_dims);
auto B_sz = getSizeFromDims(B_dims);
std::vector<float> A_fp16(A_sz);
std::vector<float> B_fp16(B_sz);
fbgemm::RoundToFloat16(
A, A_fp16.data(), A_sz, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fbgemm::RoundToFloat16(
B, B_fp16.data(), B_sz, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
functor.Forward(A_dims, B_dims, A_fp16.data(), B_fp16.data(), C, context);
fbgemm::RoundToFloat16(C, C, A_sz, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
return true;
}
Functor functor;
};
} // namespace
REGISTER_CPU_OPERATOR(SumFakeFp16, SumFP16FP16AccOp<CPUContext>);
OPERATOR_SCHEMA(SumFakeFp16).NumInputs(1, INT_MAX).NumOutputs(1, INT_MAX);
REGISTER_CPU_OPERATOR(
AddFakeFp16,
BinaryElementwiseOp<
TensorTypes<float, int, long>,
CPUContext,
FP16PairWiseCPUFunctor<AddFunctor<CPUContext>>>);
OPERATOR_SCHEMA(AddFakeFp16).NumInputs(2).NumOutputs(1);
REGISTER_CPU_OPERATOR(
DivFakeFp16,
BinaryElementwiseOp<
TensorTypes<float, double>,
CPUContext,
FP16PairWiseCPUFunctor<DivFunctor<CPUContext>>>);
OPERATOR_SCHEMA(DivFakeFp16).NumInputs(2).NumOutputs(1);
REGISTER_CPU_OPERATOR(
MulFakeFp16,
BinaryElementwiseOp<
TensorTypes<float>,
CPUContext,
FP16PairWiseCPUFunctor<MulFunctor<CPUContext>>>);
OPERATOR_SCHEMA(MulFakeFp16).NumInputs(2).NumOutputs(1);
REGISTER_CPU_OPERATOR(
SubFakeFp16,
BinaryElementwiseOp<
TensorTypes<float>,
CPUContext,
FP16PairWiseCPUFunctor<SubFunctor<CPUContext>>>);
OPERATOR_SCHEMA(SubFakeFp16).NumInputs(2).NumOutputs(1);
} // namespace caffe2

View File

@ -1,110 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <functional>
#include "caffe2/contrib/fakelowp/fp16_fc_acc_op.h"
#include "caffe2/core/init.h"
#include "caffe2/core/tensor.h"
#include "caffe2/operators/fc_inference.h"
namespace caffe2 {
template <>
int Fp16FCAccOp<CPUContext, DefaultEngine, false>::runs = 0;
template <>
float Fp16FCAccOp<CPUContext, DefaultEngine, false>::total_error = 0.0;
template <>
float Fp16FCAccOp<CPUContext, DefaultEngine, false>::total_error_with_bias =
0.0;
template <>
int Fp16FCAccOp<CPUContext, DefaultEngine, true>::runs = 0;
template <>
float Fp16FCAccOp<CPUContext, DefaultEngine, true>::total_error = 0.0;
template <>
float Fp16FCAccOp<CPUContext, DefaultEngine, true>::total_error_with_bias = 0.0;
REGISTER_CPU_OPERATOR(
Fp16FCAcc32,
Fp16FCAccOp<
CPUContext,
DefaultEngine,
false /* USE_ACC_FP16 */,
true /* USE_TMP_ACCUMULATOR */,
false /* ADD_BIAS_FIRST */>);
using namespace std::placeholders;
OPERATOR_SCHEMA(Fp16FCAcc32)
.NumInputs(3)
.NumOutputs(1)
.TensorInferenceFunction(std::bind(FCShapeInference, _1, _2, false))
.CostInferenceFunction(OpSchema::CostInferenceFunctionType(
std::bind(CostInferenceForFC, _1, _2, false)))
.SetDoc(R"DOC(Same as FC)DOC");
REGISTER_CPU_OPERATOR(
Fp16FCAcc16,
Fp16FCAccOp<
CPUContext,
DefaultEngine,
true /* USE_ACC_FP16 */,
true /* USE_TMP_ACCUMULATOR */,
false /* ADD_BIAS_FIRST */>);
OPERATOR_SCHEMA(Fp16FCAcc16)
.NumInputs(3)
.NumOutputs(1)
.TensorInferenceFunction(std::bind(FCShapeInference, _1, _2, false))
.CostInferenceFunction(OpSchema::CostInferenceFunctionType(
std::bind(CostInferenceForFC, _1, _2, false)))
.SetDoc(R"DOC(Same as FC)DOC");
REGISTER_CPU_OPERATOR(
Fp16FCAcc32NNPI,
Fp16FCAccOp<
CPUContext,
DefaultEngine,
false /* USE_ACC_FP16 */,
false /* USE_TMP_ACCUMULATOR */,
true /* ADD_BIAS_FIRST */>);
OPERATOR_SCHEMA(Fp16FCAcc32NNPI)
.NumInputs(3)
.NumOutputs(1)
.TensorInferenceFunction(std::bind(FCShapeInference, _1, _2, false))
.CostInferenceFunction(OpSchema::CostInferenceFunctionType(
std::bind(CostInferenceForFC, _1, _2, false)))
.SetDoc(R"DOC(Same as FC)DOC");
REGISTER_CPU_OPERATOR(
Fp16FCAcc16NNPI,
Fp16FCAccOp<
CPUContext,
DefaultEngine,
true /* USE_ACC_FP16 */,
false /* USE_TMP_ACCUMULATOR */,
true /* ADD_BIAS_FIRST */>);
OPERATOR_SCHEMA(Fp16FCAcc16NNPI)
.NumInputs(3)
.NumOutputs(1)
.TensorInferenceFunction(std::bind(FCShapeInference, _1, _2, false))
.CostInferenceFunction(OpSchema::CostInferenceFunctionType(
std::bind(CostInferenceForFC, _1, _2, false)))
.SetDoc(R"DOC(Same as FC)DOC");
} // namespace caffe2

View File

@ -1,398 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <fbgemm/FbgemmConvert.h>
#include <fbgemm/FbgemmFP16.h>
#include <immintrin.h>
#include "caffe2/contrib/fakelowp/fp16_gemm_utils.h"
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor.h"
#include "caffe2/utils/conversions.h"
#include "caffe2/utils/math.h"
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
namespace caffe2 {
using namespace std;
// C2 wrapper for fp16 gemm with fp16 accumulation
template <
class Context,
class Engine = DefaultEngine,
bool USE_ACC_FP16 = false, // Whether use fp16 accumulation
bool USE_TMP_ACCUMULATOR = false,
bool ADD_BIAS_FIRST = false>
class Fp16FCAccOp final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
Fp16FCAccOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
axis_(OperatorBase::GetSingleArgument<int32_t>("axis", 1)),
axis_w_(OperatorBase::GetSingleArgument<int32_t>("axis_w", 1)) {}
~Fp16FCAccOp() noexcept override {
if (X_fp16_ != nullptr) {
delete[] X_fp16_;
}
if (W_fp16_ != nullptr) {
delete[] W_fp16_;
}
if (b_fp16_ != nullptr) {
delete[] b_fp16_;
}
if (bias_multiplier_fp16_ != nullptr) {
delete[] bias_multiplier_fp16_;
}
}
// template on X, B, W and Y.
template <typename T_X, typename T_B, typename T_W, typename T_Y>
bool DoRunWithType() {
const auto& X = Input(0);
const auto& W_blob = OperatorBase::InputBlob(1);
const auto& b = Input(2);
auto* Y = Output(0);
CAFFE_ENFORCE(b.ndim() == 1, b.ndim());
// batch size
const auto canonical_axis = X.canonical_axis_index(axis_);
const int M = X.size_to_dim(canonical_axis);
const int N = b.size();
const int K = X.size_from_dim(canonical_axis);
Y_shape_cache_ = X.sizes().vec();
// This is an invariant of canonical_axis, so we can DCHECK.
TORCH_DCHECK_LE(canonical_axis + 1, Y_shape_cache_.size());
Y_shape_cache_.resize(canonical_axis + 1);
Y_shape_cache_[canonical_axis] = N;
Y->Resize(Y_shape_cache_);
if (X.size() == 0) {
// skip the rest of the computation if X is empty
Y->template mutable_data<T_Y>();
return true;
}
// Convert X and W to fp16
int X_size = M * K;
int W_size = N * K;
if (X_fp16_ == nullptr) {
X_fp16_ = new float[X_size];
X_size_cached_ = X_size;
} else if (X_size > X_size_cached_) {
delete[] X_fp16_;
X_fp16_ = new float[X_size];
X_size_cached_ = X_size;
}
fbgemm::RoundToFloat16(
X.template data<T_X>(),
X_fp16_,
X_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
if (W_fp16_ == nullptr) {
W_fp16_ = new float[W_size];
const T_W* W_data = nullptr;
if (W_blob.template IsType<
caffe2::unique_ptr<fbgemm::PackedGemmMatrixFP16>>()) {
auto* W_fbgemm =
OperatorBase::Input<
caffe2::unique_ptr<fbgemm::PackedGemmMatrixFP16>>(1)
.get();
if (!W_fbgemm->packed()) {
float* W_fp16_trans = new float[W_size];
fbgemm::Float16ToFloat_avx2(W_fbgemm->pmat(), W_fp16_trans, W_size);
for (const auto i : c10::irange(N)) {
for (const auto j : c10::irange(K)) {
W_fp16_[j * N + i] = W_fp16_trans[i * K + j];
}
}
delete[] W_fp16_trans;
} else {
vector<fbgemm::float16> unpacked_mat;
unpacked_mat.resize(W_size);
W_fbgemm->unpack(
unpacked_mat.data(), fbgemm::matrix_op_t::NoTranspose);
fbgemm::Float16ToFloat_avx2(unpacked_mat.data(), W_fp16_, W_size);
}
} else {
const auto& W = Input(1);
W_data = W.template data<T_W>();
// Transpose W
for (const auto i : c10::irange(N)) {
for (const auto j : c10::irange(K)) {
W_fp16_[j * N + i] = W_data[i * K + j];
}
}
}
fbgemm::RoundToFloat16(
W_fp16_, W_fp16_, W_size, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
auto Y_data = Y->template mutable_data<T_Y>();
int Y_size = M * N;
// Initialize Y
memset(Y_data, 0.0, sizeof(float) * Y_size);
// Add bias term, accumulation is in fp16.
if (bias_multiplier_.size() != M) {
// If the helper bias multiplier is not M, reshape and fill it with one.
bias_multiplier_.Resize(M);
math::Set<T_B, Context>(
M,
convert::To<float, T_B>(1),
bias_multiplier_.template mutable_data<T_B>(),
&context_);
}
if (bias_multiplier_fp16_ == nullptr) {
bias_multiplier_fp16_ = new float[M];
M_cached_ = M;
} else if (M > M_cached_) {
delete[] bias_multiplier_fp16_;
bias_multiplier_fp16_ = new float[M];
M_cached_ = M;
}
fbgemm::RoundToFloat16(
bias_multiplier_.template data<T_B>(),
bias_multiplier_fp16_,
M,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
if (b_fp16_ == nullptr) {
b_fp16_ = new float[N];
}
fbgemm::RoundToFloat16(
b.template data<T_B>(),
b_fp16_,
N,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
if (ADD_BIAS_FIRST) {
custom_fp16_gemm(
M,
1,
N,
bias_multiplier_fp16_,
b_fp16_,
0.f,
Y->template mutable_data<T_Y>(),
USE_ACC_FP16,
USE_TMP_ACCUMULATOR);
#ifdef LOG_LEVEL_FOR_FBFCPACKEDACC16_ACCURACY_LOG
float* Y_ref = new float[M * N]();
TensorProto::DataType math_type = TensorProto_DataType_FLOAT;
math::Gemm<T_B, Context, Engine>(
CblasNoTrans,
CblasNoTrans,
M,
N,
1,
1,
bias_multiplier_.template data<T_B>(),
b.template data<T_B>(),
0.f,
Y_ref,
&context_,
math_type);
relative_error =
compute_relative_error(Y->template mutable_data<T_Y>(), Y_ref, M * N);
total_error_with_bias += relative_error;
VLOG(LOG_LEVEL_FOR_FBFCPACKEDACC16_ACCURACY_LOG)
<< "Relative error for Y = bias_multiplier_ * b' = " << relative_error
<< ", average error with bias after " << runs
<< " runs = " << total_error_with_bias / runs << endl;
#endif
custom_fp16_gemm(
M,
K,
N,
X_fp16_,
W_fp16_,
1.f,
Y->template mutable_data<T_Y>(),
USE_ACC_FP16,
USE_TMP_ACCUMULATOR);
#ifdef LOG_LEVEL_FOR_FBFCPACKEDACC16_ACCURACY_LOG
if (!W_blob.IsType<caffe2::unique_ptr<fbgemm::PackedGemmMatrixFP16>>()) {
const auto& W = Input(1);
math::Gemm<float, Context, Engine>(
CblasNoTrans,
CblasTrans,
M,
N,
K,
1,
X.template data<T_X>(),
W.template data<T_W>(),
1.f,
Y_ref,
&context_,
math_type);
runs++;
float relative_error = compute_relative_error(
Y->template mutable_data<T_Y>(), Y_ref, M * N);
total_error += relative_error;
VLOG(LOG_LEVEL_FOR_FBFCPACKEDACC16_ACCURACY_LOG)
<< "Relative error for Y = bias_multiplier_ * b' + X * W' = "
<< relative_error << ", average error after " << runs
<< " runs = " << total_error / runs << endl;
if (Y_ref != nullptr) {
delete[] Y_ref;
}
}
#endif
} else {
custom_fp16_gemm(
M,
K,
N,
X_fp16_,
W_fp16_,
0.f,
Y->template mutable_data<T_Y>(),
USE_ACC_FP16,
USE_TMP_ACCUMULATOR);
#ifdef LOG_LEVEL_FOR_FBFCPACKEDACC16_ACCURACY_LOG
if (!W_blob.IsType<caffe2::unique_ptr<fbgemm::PackedGemmMatrixFP16>>()) {
const auto& W = Input(1);
float* Y_ref = new float[M * N]();
TensorProto::DataType math_type = TensorProto_DataType_FLOAT;
math::Gemm<float, Context, Engine>(
CblasNoTrans,
CblasTrans,
M,
N,
K,
1,
X.template data<T_X>(),
W.template data<T_W>(),
0.f,
Y_ref,
&context_,
math_type);
runs++;
float relative_error = compute_relative_error(
Y->template mutable_data<T_Y>(), Y_ref, M * N);
total_error += relative_error;
VLOG(LOG_LEVEL_FOR_FBFCPACKEDACC16_ACCURACY_LOG)
<< "Relative error for Y = X * W' = " << relative_error
<< ", average error after " << runs
<< " runs = " << total_error / runs << endl;
}
#endif
custom_fp16_gemm(
M,
1,
N,
bias_multiplier_fp16_,
b_fp16_,
1.f,
Y->template mutable_data<T_Y>(),
USE_ACC_FP16,
USE_TMP_ACCUMULATOR);
#ifdef LOG_LEVEL_FOR_FBFCPACKEDACC16_ACCURACY_LOG
math::Gemm<T_B, Context, Engine>(
CblasNoTrans,
CblasNoTrans,
M,
N,
1,
1,
bias_multiplier_.template data<T_B>(),
b.template data<T_B>(),
1,
Y_ref,
&context_,
math_type);
relative_error =
compute_relative_error(Y->template mutable_data<T_Y>(), Y_ref, M * N);
total_error_with_bias += relative_error;
VLOG(LOG_LEVEL_FOR_FBFCPACKEDACC16_ACCURACY_LOG)
<< "Relative error for Y = X * W' + bias_multiplier_ * b' = "
<< relative_error << ", average error with bias after " << runs
<< " runs = " << total_error_with_bias / runs << endl;
if (Y_ref != nullptr) {
delete[] Y_ref;
}
#endif
}
return true;
}
#ifdef LOG_LEVEL_FOR_FBFCPACKEDACC16_ACCURACY_LOG
float compute_L2_norm(float* A, int size) {
float square_sum = 0.0;
for (const auto i : c10::irange(size)) {
square_sum += A[i] * A[i];
}
return std::sqrt(square_sum);
}
float compute_relative_error(float* A, float* A_ref, int size) {
float error = 0.0;
for (const auto i : c10::irange(size)) {
error += (A[i] - A_ref[i]) * (A[i] - A_ref[i]);
}
error = std::sqrt(error);
float L2_norm = compute_L2_norm(A, size);
return error / L2_norm;
}
#endif
bool RunOnDevice() override {
return DoRunWithType<
float, // X
float, // B
float, // W
float>(); // Y
}
protected:
size_t axis_{1};
size_t axis_w_{1};
size_t X_size_cached_{0};
size_t M_cached_{0};
static int runs;
static float total_error;
static float total_error_with_bias;
float* X_fp16_ = nullptr;
float* W_fp16_ = nullptr;
float* b_fp16_ = nullptr;
float* bias_multiplier_fp16_ = nullptr;
// A local vector to cache the output shape so we don't need to recreate
// a vector object every time we run Run().
vector<int64_t> Y_shape_cache_;
Tensor bias_multiplier_{Context::GetDeviceType()};
};
} // namespace caffe2

View File

@ -1,129 +0,0 @@
#include "fp16_fma.h"
#include <immintrin.h>
#include <cmath>
#include <cstdint>
namespace fake_fp16 {
// Compute fp16 FMA using fp16
// Out = FMA (A, B, Out)
//
// Algorithm:
// Do an FMA in fp64
// Since fp16 has 10 bits of mantissa and fp64 has 52, zero out
// 42 bits.
// Extract the exponent.
// If the exponent ends up in the subnormal range, shift out
// only 42 - (14 + exponent).
// Compute the bounce value ("Bouncer") as a value big enough to
// push out all of the digits except for the ones representable in fp16;
// the objective is to let the machine's own rounding do the truncation.
// Add 42 or the computed number (in case of denormals) to the exponent.
// For negative numbers set the highest bit of the mantissa to 1.
void fma_fp16(int N, const float* A, const float* B, float* Out) {
constexpr int blockSize = 4;
constexpr uint64_t mask = 0x7ff0000000000000;
constexpr uint64_t shift_bits = 52;
constexpr uint64_t offset = 1023;
constexpr uint64_t dbl_threehalf = 0x3ff8000000000000;
uint64_t expo_bouncer;
// It can be proven that in the absence of intermediate overflow
// the desired numerical result can be obtained even with the
// possibility of a double rounding, as follows:
// round-to-fp16-precision( (double)A * (double)B + (double)C )
// This statement is not proved here; but we explain how to round a fp64
// number into fp16 precision using the technique of a "Bouncer"
// Suppose a numerical value in fp64 has exponent value of E
// If -14 <= E <= 15 (the fp16 exponent value for normalized number),
// the lsb of this value in fp16 precision is 2^(E-10).
// Now consider this fp64 number Bouncer which is 2^(52+(E-10)) * 3/2
// The lsb of Bouncer is (by design) 2^(E-10). Because Bouncer is
// very much bigger than the fp16 value, denoted by say x,
// 2^(52+(E-10)) < Bouncer + x < 2^(53+(E-10))
// Thus TMP := Bouncer + x in double precision forces x to be rounded off
// at the lsb position of 2^(E-10).
// Consequently, the subtraction yields the desired result
// x_fp16_precision := TMP - Bouncer;
// If E < -14, we are dealing with the subnormal number range, where the lsb
// of fp16 precision is FIXED at 2^(-24) (definition of fp16).
// Hence the Bouncer is set at 2^(52-24) = 2^(28)
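// Worked example (illustrative values): take x = 1 + 3*2^-12, whose fp64
// exponent is E = 0, so the fp16 lsb is 2^-10 and Bouncer = 1.5 * 2^42.
// TMP := Bouncer + x rounds x at the 2^-10 position (round-to-nearest-even),
// and TMP - Bouncer = 1 + 2^-10 = 1.0009765625, which is exactly x rounded
// to fp16 precision.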
int n = 0;
for (; n + blockSize < N; n += blockSize) {
__m256d mA = _mm256_cvtps_pd(_mm_loadu_ps(A + n));
__m256d mB = _mm256_cvtps_pd(_mm_loadu_ps(B + n));
__m256d mOut = _mm256_cvtps_pd(_mm_loadu_ps(Out + n));
mOut = _mm256_fmadd_pd(mA, mB, mOut);
__m256i mExpv =
_mm256_and_si256(_mm256_castpd_si256(mOut), _mm256_set1_epi64x(mask));
mExpv = _mm256_srli_epi64(mExpv, shift_bits);
mExpv = _mm256_sub_epi64(mExpv, _mm256_set1_epi64x(offset));
__m256i cmp = _mm256_cmpgt_epi64(_mm256_set1_epi64x(-14), mExpv);
__m256i mExpoBouncer = _mm256_and_si256(cmp, _mm256_set1_epi64x(28));
mExpoBouncer = _mm256_or_si256(
mExpoBouncer,
_mm256_andnot_si256(
cmp, _mm256_add_epi64(_mm256_set1_epi64x(42), mExpv)));
__m256i mBouncer = _mm256_add_epi64(
_mm256_set1_epi64x(dbl_threehalf),
_mm256_slli_epi64(mExpoBouncer, shift_bits));
mOut = _mm256_sub_pd(
_mm256_add_pd(_mm256_castsi256_pd(mBouncer), mOut),
_mm256_castsi256_pd(mBouncer));
_mm_storeu_ps(Out + n, _mm256_cvtpd_ps(mOut));
}
// Epilogue
for (; n < N; n++) {
typedef union {
uint64_t I;
double F;
} flint64;
flint64 A_, B_, Out_, Bouncer;
A_.F = A[n];
B_.F = B[n];
Out_.F = Out[n];
// This is FMA in FP64
Out_.F = std::fma(A_.F, B_.F, Out_.F);
// We now round Out_.F to fp16 precision using a Bouncer
// First, figure out the exponent value E of Out_.F
int64_t expv = ((Out_.I & mask) >> shift_bits) - offset;
// Second: create the Bouncer. To do that, we
// first compute its exponent and then add that exponent value
// to the exponent field of the constant 3/2.
if (expv < -14) {
expo_bouncer = 28;
} else {
expo_bouncer = 42 + expv;
}
Bouncer.I = dbl_threehalf + (expo_bouncer << shift_bits);
// This is rounding to fp16 precision; add and subtract Bouncer
Out_.F = (Bouncer.F + Out_.F) - Bouncer.F;
Out[n] = Out_.F;
}
}
float fmafp32_avx_emulation(float v1, float v2, float v3) {
__m256 v1Vec = _mm256_set1_ps(v1);
__m256 v2Vec = _mm256_set1_ps(v2);
__m256 v3Vec = _mm256_set1_ps(v3);
__m256 resVec = _mm256_fmadd_ps(v1Vec, v2Vec, v3Vec);
float *result = (float *)&resVec;
return *result;
}
} // namespace fake_fp16

View File

@ -1,16 +0,0 @@
#pragma once
#include <glog/logging.h>
namespace fake_fp16 {
// Compute FMA using fp16 accumulation
// Out = FMA (A, B, Out)
void fma_fp16(int N, const float* A, const float* B, float* Out);
void fma_fp16_slow(int N, const float* A, const float* B, float* Out);
float fma_fp16_slow(const float A, const float B, float Out);
float fmafp32_avx_emulation(float v1, float v2, float v3);
} // namespace fake_fp16

View File

@ -1,540 +0,0 @@
#include <immintrin.h>
#include "fp16_fma.h"
namespace fp16_fma {
typedef int int16;
typedef char int8;
typedef unsigned short int bits16;
typedef unsigned int bits32;
typedef signed char Word8;
typedef unsigned char UWord8;
typedef signed short Word16;
typedef unsigned short UWord16;
typedef signed int Word32;
typedef unsigned int UWord32;
typedef long long Word64;
typedef unsigned long long UWord64;
typedef unsigned short float16;
typedef signed int sbits32;
typedef signed short int sbits16;
typedef char flag;
#define MAX_U32 (UWord32)0xffffffffL
#define MAX_U16 (UWord16)0xffff
#define BITMASK_T(typ, w) (((typ)1 << (w)) - 1)
#define TESTBIT(x, n) (((x) >> (n)) & 1)
#define float16_default_nan 0x7E00
#define float16_default_nan_pos 0x7E00
#define float16_default_nan_neg 0xFE00
int8 float_exception_flags = 0;
enum {
float_round_nearest_even = 0,
float_round_down = 1,
float_round_up = 2,
float_round_to_zero = 3
};
int8 float_rounding_mode = float_round_nearest_even;
enum { float_tininess_after_rounding = 0, float_tininess_before_rounding = 1 };
int float_detect_tininess = float_tininess_after_rounding;
inline bits16 extractFloat16Frac(float16 a) {
return a & 0x3FF;
}
inline int16 extractFloat16Exp(float16 a) {
return (a >> 10) & 0x1F;
}
inline flag extractFloat16Sign(float16 a) {
return a >> 15;
}
flag float16_is_quiet_nan(float16 a) {
return (0xFC00 <= (bits16)(a << 1));
}
flag float16_is_signaling_nan(float16 a) {
return (((a >> 9) & 0x3F) == 0x3E) && (a & 0x01FF);
}
enum {
float_flag_inexact = 1,
float_flag_divbyzero = 2,
float_flag_underflow = 4,
float_flag_overflow = 8,
float_flag_invalid = 16
};
void float_raise(int8 flags) {
float_exception_flags |= flags;
}
int pickNaNMulAdd(
flag aIsQNaN,
flag aIsSNaN,
flag bIsQNaN,
flag bIsSNaN,
flag cIsQNaN,
flag cIsSNaN,
flag infzero) {
if (infzero) {
float_raise(float_flag_invalid);
return 2;
}
if (cIsSNaN || cIsQNaN) {
return 2;
} else if (bIsSNaN || bIsQNaN) {
return 1;
} else {
return 0;
}
}
inline float16 packFloat16(flag zSign, int16 zExp, bits16 zSig) {
return (((bits16)zSign) << 15) + (((bits16)zExp) << 10) + zSig;
}
float16
propagateFloat16MulAddNaN(float16 a, float16 b, float16 c, flag infzero) {
flag aIsQuietNaN, aIsSignalingNaN, bIsQuietNaN, bIsSignalingNaN, cIsQuietNaN,
cIsSignalingNaN;
int selNaN;
aIsQuietNaN = float16_is_quiet_nan(a);
aIsSignalingNaN = float16_is_signaling_nan(a);
bIsQuietNaN = float16_is_quiet_nan(b);
bIsSignalingNaN = float16_is_signaling_nan(b);
cIsQuietNaN = float16_is_quiet_nan(c);
cIsSignalingNaN = float16_is_signaling_nan(c);
if (aIsSignalingNaN | bIsSignalingNaN | cIsSignalingNaN) {
float_raise(float_flag_invalid);
}
selNaN = pickNaNMulAdd(
aIsQuietNaN,
aIsSignalingNaN,
bIsQuietNaN,
bIsSignalingNaN,
cIsQuietNaN,
cIsSignalingNaN,
infzero);
switch (selNaN) {
case 0:
return a | (1 << 9);
case 1:
return b | (1 << 9);
case 2:
return c | (1 << 9);
case 3:
default:
return float16_default_nan;
}
}
inline void shift32RightJamming(bits32 a, int16 count, bits32* zPtr) {
bits32 z;
if (count == 0) {
z = a;
} else if (count < 32) {
z = (a >> count) | ((a << ((-count) & 31)) != 0);
} else {
z = (a != 0);
}
*zPtr = z;
}
void shift16RightJamming(bits16 a, int16 count, bits16* zPtr) {
bits16 z;
if (count == 0) {
z = a;
} else if (count < 16) {
z = (a >> count) | (((a << ((-count) & 15)) & 0xffff) != 0);
} else {
z = (a != 0);
}
*zPtr = z;
}
Word8 GetRound(Word32 fcr) {
Word8 res, round_mode;
round_mode = fcr & 0x3; // lower 2 bits as rounding mode in FCR
res = (round_mode == 3)
? 1
: ((round_mode == 2)
? 2
: ((round_mode == 1) ? 3 : 0)); // Translate to float_rounding_mode
return res;
}
Word8 GetException(Word32 fsr) {
Word8 res = 0;
if (TESTBIT(fsr, 7) == 1)
res |= 32; // float_flag_inexact
if (TESTBIT(fsr, 8) == 1)
res |= 16; // float_flag_underflow
if (TESTBIT(fsr, 9) == 1)
res |= 8; // float_flag_overflow
if (TESTBIT(fsr, 10) == 1)
res |= 4; // float_flag_divbyzero
if (TESTBIT(fsr, 11) == 1)
res |= 1; // float_flag_invalid
return res;
}
float16 roundAndPackFloat16(flag zSign, int16 zExp, bits16 zSig) {
int8 roundingMode;
flag roundNearestEven;
int8 roundIncrement, roundBits;
flag isTiny;
roundingMode = float_rounding_mode;
roundNearestEven = (roundingMode == float_round_nearest_even);
roundIncrement = 0x8;
if (!roundNearestEven) {
// if ( ( ! roundNearestEven ) && ( roundingMode !=
// float_round_ties_away) ) {
if (roundingMode == float_round_to_zero) {
roundIncrement = 0;
} else {
roundIncrement = 0xF;
if (zSign) {
if (roundingMode == float_round_up)
roundIncrement = 0;
} else {
if (roundingMode == float_round_down)
roundIncrement = 0;
}
}
}
roundBits = zSig & 0xF;
if (0x1D <= (bits16)zExp) {
if ((0x1D < zExp) ||
((zExp == 0x1D) && ((sbits16)(zSig + roundIncrement) < 0))) {
float_raise(float_flag_overflow | float_flag_inexact);
return packFloat16(zSign, 0x1F, 0) - (roundIncrement == 0);
}
if (zExp < 0) {
isTiny = (float_detect_tininess == float_tininess_before_rounding) ||
(zExp < -1) || (zSig + roundIncrement < 0x8000);
shift16RightJamming(zSig, -zExp, &zSig);
zExp = 0;
roundBits = zSig & 0xF;
if (isTiny && roundBits)
float_raise(float_flag_underflow);
}
}
if (roundBits)
float_exception_flags |= float_flag_inexact;
zSig = (zSig + roundIncrement) >> 4;
zSig &= ~(((roundBits ^ 0x8) == 0) & roundNearestEven);
if (zSig == 0)
zExp = 0;
return packFloat16(zSign, zExp, zSig);
}
int8 countLeadingZeros32(bits32 a) {
static const int8 countLeadingZerosHigh[] = {
8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int8 shiftCount;
shiftCount = 0;
if (a < 0x10000) {
shiftCount += 16;
a <<= 16;
}
if (a < 0x1000000) {
shiftCount += 8;
a <<= 8;
}
shiftCount += countLeadingZerosHigh[a >> 24];
return shiftCount;
}
void normalizeFloat16Subnormal(bits16 aSig, int16* zExpPtr, bits16* zSigPtr) {
int8 shiftCount;
shiftCount = countLeadingZeros32((bits32)aSig) - 16 - 5;
*zSigPtr = aSig << shiftCount;
*zExpPtr = 1 - shiftCount;
}
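// float16 layout assumed throughout this file: 1 sign bit, 5 exponent bits
// (bias 15) and 10 fraction bits; exponent 0x1f encodes inf/NaN and exponent 0
// encodes zeros and subnormals.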
float16 float16_muladd(float16 a, float16 b, float16 c, flag negate_product) {
flag aSign, bSign, cSign, zSign;
int16 aExp, bExp, cExp, pExp, zExp, expDiff;
bits16 aSig, bSig, cSig;
flag pInf, pZero, pSign;
bits32 pSig32, cSig32, zSig32;
bits16 pSig;
int shiftcount;
flag infzero;
/* Extract the sign bit, exponent and significand */
aSig = extractFloat16Frac(a);
aExp = extractFloat16Exp(a);
aSign = extractFloat16Sign(a);
bSig = extractFloat16Frac(b);
bExp = extractFloat16Exp(b);
bSign = extractFloat16Sign(b);
cSig = extractFloat16Frac(c);
cExp = extractFloat16Exp(c);
cSign = extractFloat16Sign(c);
/* Flag to indicate fusedMultiplyAdd(0, inf, c) or fusedMultiplyAdd(inf, 0, c) */
infzero =
((aExp == 0 && aSig == 0 && bExp == 0x1f && bSig == 0) ||
(aExp == 0x1f && aSig == 0 && bExp == 0 && bSig == 0));
/* CASE1: if any input is NaN => NaN propagate */
/* It is implementation-defined whether the cases of (0,inf,qnan)
* and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
* they return if they do), so we have to hand this information
* off to the target-specific pick-a-NaN routine.
*/
/* IEEE754 7.2 - Invalid: fusedMultiplyAdd(0, inf, c) or
* fusedMultiplyAdd(inf, 0 , c) unless c is a quiet NaN; If c is a
* quiet NaN then it is implementation defined whether the invalid operation
* exception is signaled.
*/
if (((aExp == 0x1f) && aSig) || ((bExp == 0x1f) && bSig) ||
((cExp == 0x1f) && cSig)) {
return propagateFloat16MulAddNaN(a, b, c, infzero);
}
/* Work out the sign and type of the product */
pSign = aSign ^ bSign;
if (negate_product) {
pSign ^= 1;
}
/* CASE2: fusedMultiplyAdd(0, inf, c) or fusedMultiplyAdd(inf,0, c) and c is
* not NaN => raise invalid */
if (infzero) {
float_raise(float_flag_invalid);
return float16_default_nan;
}
pInf = (aExp == 0x1f) || (bExp == 0x1f);
pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
/* CASE3 and CASE4: c is inf, p is number or inf*/
if (cExp == 0x1f) {
if (pInf && (pSign ^ cSign)) {
/* CASE3: addition of opposite-signed infinities => InvalidOperation */
float_raise(float_flag_invalid);
return float16_default_nan;
}
/* CASE4: Otherwise generate an infinity of the same sign */
return packFloat16(cSign, 0x1f, 0);
}
/* CASE5: c is number and p is inf */
if (pInf) {
return packFloat16(pSign, 0x1f, 0);
}
/* CASE6: c is number, p is zero */
if (pZero) {
if (cExp == 0) {
if (cSig == 0) {
/* Adding two exact zeroes */
if (pSign == cSign) {
zSign = pSign;
} else if (float_rounding_mode == float_round_down) {
zSign = 1;
} else {
zSign = 0;
}
return packFloat16(zSign, 0, 0);
}
}
/* CASE7: Zero plus something non-zero : just return the something */
return c;
}
if (aExp == 0) {
normalizeFloat16Subnormal(aSig, &aExp, &aSig);
}
if (bExp == 0) {
normalizeFloat16Subnormal(bSig, &bExp, &bSig);
}
/* Calculate the actual result a * b + c */
/* NOTE: we subtract 0xe where float16_mul() subtracts 0xf
* because we want the true exponent, not the "one-less-than"
* flavour that roundAndPackFloat16() takes.
*/
pExp = aExp + bExp - 0xe;
aSig = (aSig | 0x0400) << 4;
bSig = (bSig | 0x0400) << 5;
pSig32 = (bits32)aSig * bSig;
if ((sbits32)(pSig32 << 1) >= 0) {
pSig32 <<= 1;
pExp--;
}
zSign = pSign;
/* Now pSig32 is the significand of the multiply, with the explicit bit in
* position 30.
*/
if (cExp == 0) {
if (!cSig) {
/* Throw out the special case of c being an exact zero now */
shift32RightJamming(pSig32, 16, &pSig32);
pSig = pSig32;
return roundAndPackFloat16(zSign, pExp - 1, pSig);
}
normalizeFloat16Subnormal(cSig, &cExp, &cSig);
}
cSig32 = (bits32)cSig << (30 - 10);
cSig32 |= 0x40000000;
expDiff = pExp - cExp;
if (pSign == cSign) {
/* Addition */
if (expDiff > 0) {
/* scale c to match p */
shift32RightJamming(cSig32, expDiff, &cSig32);
zExp = pExp;
} else if (expDiff < 0) {
/* scale p to match c */
shift32RightJamming(pSig32, -expDiff, &pSig32);
zExp = cExp;
} else {
/* no scaling needed */
zExp = cExp;
}
/* Add significands and make sure explicit bit ends up in posn 30 */
zSig32 = pSig32 + cSig32;
if ((sbits32)zSig32 < 0) {
shift32RightJamming(zSig32, 1, &zSig32);
} else {
zExp--;
}
} else {
/* Subtraction */
if (expDiff > 0) {
shift32RightJamming(cSig32, expDiff, &cSig32);
zSig32 = pSig32 - cSig32;
zExp = pExp;
} else if (expDiff < 0) {
shift32RightJamming(pSig32, -expDiff, &pSig32);
zSig32 = cSig32 - pSig32;
zExp = cExp;
zSign ^= 1;
} else {
zExp = pExp;
if (cSig32 < pSig32) {
zSig32 = pSig32 - cSig32;
} else if (pSig32 < cSig32) {
zSig32 = cSig32 - pSig32;
zSign ^= 1;
} else {
/* Exact zero */
zSign = 0;
if (float_rounding_mode == float_round_down) {
zSign ^= 1;
}
return packFloat16(zSign, 0, 0);
}
}
--zExp;
/* Normalize to put the explicit bit back into bit 30. */
shiftcount = countLeadingZeros32(zSig32) - 1;
zSig32 <<= shiftcount;
zExp -= shiftcount;
}
shift32RightJamming(zSig32, 16, &zSig32);
return roundAndPackFloat16(zSign, zExp, zSig32);
}
void fp_mac_h(
Word16 d0,
Word16 d1,
Word16 d2,
Word32 negate_product,
Word32 fcr,
Word32 fsr_i,
Word16* res,
Word32* fsr_o) {
// Extract rounding mode from FCR/FSR to softfloat
float_rounding_mode = GetRound(fcr);
float_exception_flags = GetException(fsr_i);
// Call softfloat lib
*res = float16_muladd(d1, d2, d0, negate_product);
//*fsr_o = PutException(float_exception_flags, fsr_i);
}
void fma16(
const Word16 input,
const Word16 a,
const Word16 b,
const Word32 fcr,
const Word32 fsr_i,
Word16* result,
Word32* fsr_o) {
Word16 res;
Word32 fsr = 0;
// Call fp utility
fp_mac_h(b, input, a, 0, fcr, fsr_i, &res, &fsr);
// Output result
*fsr_o = fsr;
*result = res;
}
float fake_fma_fp16_slow(float v1, float v2, float v3) {
uint32_t fcr_val = 0;
uint32_t fsr_val = 0x00000F80;
uint32_t exception_flags = 0;
uint16_t hv1, hv2, hv3, hresult;
hv1 = _cvtss_sh(v1, 0);
hv2 = _cvtss_sh(v2, 0);
hv3 = _cvtss_sh(v3, 0);
fma16(
*reinterpret_cast<Word16*>(&hv1),
*reinterpret_cast<Word16*>(&hv2),
*reinterpret_cast<Word16*>(&hv3),
*reinterpret_cast<Word32*>(&fcr_val),
*reinterpret_cast<Word32*>(&fsr_val),
reinterpret_cast<Word16*>(&hresult),
reinterpret_cast<Word32*>(&exception_flags));
return _cvtsh_ss(hresult);
}
void fake_fma_fp16_slow(int N, const float* A, const float* B, float* Out) {
for (int n = 0; n < N; n++) {
Out[n] = fake_fma_fp16_slow(A[n], B[n], Out[n]);
}
}
} // namespace fp16_fma

View File

@ -1,41 +0,0 @@
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <cmath>
#include <vector>
#include "fp16_fma.h"
using namespace std;
using namespace fake_fp16;
TEST(FP16_FMA, Simple) {
int x = 1;
x += 2;
int N = 6;
vector<float> A(N, 1.23);
vector<float> B(N, 2.34);
vector<float> C(N, 3.45);
fma_fp16(N, A.data(), B.data(), C.data());
for (int i = 0; i < N; i++) {
LOG(INFO) << C[i] << " ";
ASSERT_TRUE(abs(C[i] - 6.32812) < 1e-3);
}
}
TEST(FP16_FMA, Comprehensive) {
#if 0
#pragma omp parallel num_threads(30)
for (uint16_t a = 0; a < 1 << 15; a++) {
for (uint16_t b = 0; b < 1 << 15; b++) {
for (uint16_t c = 0; c < 1 << 15; c++) {
uint16_t z = a + b * c;
// fake_fma_fp16_slow(A[0], B[0], C[0]);
}
}
}
fake_fma_fp16_slow(A[0], B[0], C[0]);
#endif
}

View File

@ -1,467 +0,0 @@
#include "caffe2/contrib/fakelowp/fp16_gemm_utils.h"
#include <fbgemm/FbgemmConvert.h>
#include <fbgemm/FbgemmFP16.h>
#include <glog/logging.h>
#include <immintrin.h>
#include "caffe2/core/context.h"
#include "caffe2/utils/math.h"
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
namespace caffe2 {
// dimA(before transpose) = M x N, dimA (after transpose) = N x M.
void transpose(const float* A, std::vector<float>& A_trans, int M, int N) {
CAFFE_ENFORCE_EQ(M * N, A_trans.size());
fbgemm::transpose_simd(M, N, A, N, A_trans.data(), M);
}
void custom_fp16_gemm_with_trans(
const CBLAS_TRANSPOSE trans_A,
const CBLAS_TRANSPOSE trans_B,
const int m,
const int k,
const int n,
const float* A,
const float* B,
const float beta,
float* C,
const bool use_acc_fp16,
const bool use_temp_accumulator) {
switch (trans_A) {
case CblasNoTrans: {
switch (trans_B) {
case CblasNoTrans: {
// A * B
custom_fp16_gemm(
m, k, n, A, B, beta, C, use_acc_fp16, use_temp_accumulator);
break;
}
case CblasTrans: {
// A * B_trans
std::vector<float> B_trans(n * k);
transpose(B, B_trans, n, k);
custom_fp16_gemm(
m,
k,
n,
A,
B_trans.data(),
beta,
C,
use_acc_fp16,
use_temp_accumulator);
break;
}
default:
LOG(FATAL) << "Unexpected CBLAS_TRAnSPOSE for trans_B";
}
} break;
case CblasTrans: {
switch (trans_B) {
case CblasNoTrans: {
// A_trans * B
std::vector<float> A_trans(k * m);
transpose(A, A_trans, k, m);
custom_fp16_gemm(
m,
k,
n,
A_trans.data(),
B,
beta,
C,
use_acc_fp16,
use_temp_accumulator);
break;
}
case CblasTrans: {
// A_trans * B_trans
std::vector<float> A_trans(k * m);
std::vector<float> B_trans(n * k);
transpose(A, A_trans, k, m);
transpose(B, B_trans, n, k);
custom_fp16_gemm(
m,
k,
n,
A_trans.data(),
B_trans.data(),
beta,
C,
use_acc_fp16,
use_temp_accumulator);
break;
}
default:
LOG(FATAL) << "Unexpected CBLAS_TRAnSPOSE for trans_B";
}
} break;
default:
LOG(FATAL) << "Unexpected CBLAS_TRAnSPOSE for trans_A";
}
}
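// clamp_subnormals zeroes out any lane whose magnitude is below epsilon_
// (callers pass 2^-14, the smallest normal fp16 value), emulating
// flush-to-zero of fp16 subnormals.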
static inline __m256 clamp_subnormals(__m256 input, const float epsilon_) {
__m256 epsilon = _mm256_set1_ps(epsilon_);
__m256 nepsilon = _mm256_set1_ps(-epsilon_);
__m256 mask = _mm256_or_ps(
_mm256_cmp_ps(input, nepsilon, _CMP_LE_OS),
_mm256_cmp_ps(input, epsilon, _CMP_GE_OS));
return _mm256_and_ps(input, mask);
}
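// custom_fp16_gemm computes C = A * B + beta * C for row-major A (m x k) and
// B (k x n) whose entries are already fp16-representable floats. It blocks the
// reduction dimension by kb_max = 128 and vectorizes over n in VLEN = 8 lanes;
// with use_acc_fp16 every product and partial sum is rounded back to fp16 (and
// subnormals are flushed), and with use_temp_accumulator each k-block is summed
// into a separate register before being folded into C.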
void custom_fp16_gemm(
const int m,
const int k,
const int n,
const float* A_fp16,
const float* B_fp16,
const float beta,
float* C,
const bool use_acc_fp16,
const bool use_temp_accumulator) {
#ifdef LOG_LEVEL_FOR_FBFCPACKEDACC16_PERFORMANCE_LOG
clock_t begin = clock();
#endif
int C_size = m * n;
if (beta == 0) {
// In Caffe2 we often do a lazy initialization, which may contain NaNs in
// the float values. As a result, if beta is 0, we explicitly do a setzero.
memset(C, 0, C_size * sizeof(C[0]));
} else {
float beta_fp16 = fbgemm::cpu_half2float(fbgemm::cpu_float2half_rn(beta));
__m256 mBetaFp16 = _mm256_broadcast_ss(&beta_fp16);
int i = 0;
for (i = 0; i + 8 <= C_size; i += 8) {
__m256 mC = _mm256_loadu_ps(C + i);
mC = _mm256_mul_ps(mC, mBetaFp16);
_mm256_storeu_ps(C + i, mC);
}
for (; i < C_size; i++) {
C[i] = C[i] * beta_fp16;
}
}
// Encode the smallest normal number in float16
union epsilon_t {
float f;
uint32_t i;
};
union epsilon_t epsilon;
epsilon.i = 0x38800000u; // 1 / 16384
constexpr int VLEN = 8;
const int kb_max = 128;
for (int i = 0; i < m; i++) {
for (int l = 0; l < k; l += kb_max) {
int kb = std::min(kb_max, k - l);
for (int j = 0; j < n; j += VLEN) {
int nb = std::min(VLEN, n - j);
if (nb == VLEN) {
__m256 mC = _mm256_loadu_ps(C + i * n + j);
__m256 mC_temp = _mm256_setzero_ps();
for (int l2 = l; l2 < l + kb; l2++) {
__m256 mA_fp16 = _mm256_broadcast_ss(A_fp16 + i * k + l2);
__m256 mB_fp16 = _mm256_loadu_ps((B_fp16 + l2 * n + j));
if (use_acc_fp16) {
mA_fp16 = clamp_subnormals(mA_fp16, epsilon.f);
mB_fp16 = clamp_subnormals(mB_fp16, epsilon.f);
}
__m256 mAB = _mm256_mul_ps(mA_fp16, mB_fp16);
if (use_acc_fp16) {
__m256 mAB_fp16 = _mm256_cvtph_ps(_mm256_cvtps_ph(mAB, 0));
mAB_fp16 = clamp_subnormals(mAB_fp16, epsilon.f);
if (use_temp_accumulator) {
mC_temp = _mm256_add_ps(mC_temp, mAB_fp16);
mC_temp = _mm256_cvtph_ps(_mm256_cvtps_ph(mC_temp, 0));
} else {
mC = _mm256_add_ps(mC, mAB_fp16);
mC = _mm256_cvtph_ps(_mm256_cvtps_ph(mC, 0));
}
} else {
if (use_temp_accumulator) {
mC_temp = _mm256_add_ps(mC_temp, mAB);
} else {
mC = _mm256_add_ps(mC, mAB);
}
}
if (use_acc_fp16) {
mC = clamp_subnormals(mC, epsilon.f);
}
}
if (use_temp_accumulator) {
if (use_acc_fp16) {
mC = _mm256_cvtph_ps(_mm256_cvtps_ph(mC, 0));
mC = _mm256_add_ps(mC, mC_temp);
mC = _mm256_cvtph_ps(_mm256_cvtps_ph(mC, 0));
} else {
mC = _mm256_add_ps(mC, mC_temp);
}
}
_mm256_storeu_ps(C + i * n + j, mC);
} else {
__m256 mC_temp = _mm256_setzero_ps();
int32_t mask_src[] = {
-1,
-1,
-1,
-1,
-1,
-1,
-1,
-1,
0,
0,
0,
0,
0,
0,
0,
0,
};
__m256i imask =
_mm256_loadu_si256((__m256i const*)(mask_src + 8 - nb));
__m256 mC = _mm256_maskload_ps(C + i * n + j, imask);
for (int l2 = l; l2 < l + kb; l2++) {
__m256 mA_fp16 = _mm256_broadcast_ss(A_fp16 + i * k + l2);
__m256 mB_fp16 = _mm256_maskload_ps(B_fp16 + l2 * n + j, imask);
if (use_acc_fp16) {
mA_fp16 = clamp_subnormals(mA_fp16, epsilon.f);
mB_fp16 = clamp_subnormals(mB_fp16, epsilon.f);
}
__m256 mAB = _mm256_mul_ps(mA_fp16, mB_fp16);
if (use_acc_fp16) {
__m256 mAB_fp16 = _mm256_cvtph_ps(_mm256_cvtps_ph(mAB, 0));
mAB_fp16 = clamp_subnormals(mAB_fp16, epsilon.f);
if (use_temp_accumulator) {
mC_temp = _mm256_add_ps(mC_temp, mAB_fp16);
mC_temp = _mm256_cvtph_ps(_mm256_cvtps_ph(mC_temp, 0));
} else {
mC = _mm256_add_ps(mC, mAB_fp16);
mC = _mm256_cvtph_ps(_mm256_cvtps_ph(mC, 0));
}
} else {
if (use_temp_accumulator) {
mC_temp = _mm256_add_ps(mC_temp, mAB);
} else {
mC = _mm256_add_ps(mC, mAB);
}
}
if (use_acc_fp16) {
mC = clamp_subnormals(mC, epsilon.f);
}
}
if (use_temp_accumulator) {
if (use_acc_fp16) {
mC = _mm256_cvtph_ps(_mm256_cvtps_ph(mC, 0));
mC = _mm256_add_ps(mC, mC_temp);
mC = _mm256_cvtph_ps(_mm256_cvtps_ph(mC, 0));
} else {
mC = _mm256_add_ps(mC, mC_temp);
}
}
_mm256_maskstore_ps(C + i * n + j, imask, mC);
}
}
}
}
if (!use_acc_fp16) {
constexpr int kSize=8;
int i = 0;
for (; i + kSize <= C_size; i+= kSize) {
__m256 mC = _mm256_loadu_ps(C + i);
mC = _mm256_cvtph_ps(_mm256_cvtps_ph(mC, 0));
_mm256_storeu_ps(C + i, mC);
}
if (i < C_size){
vector<float> tmp(8);
for (int kk =0; kk + i < C_size; kk++) {
tmp[kk] = C[i + kk];
}
__m256 mC = _mm256_loadu_ps(tmp.data());
mC = _mm256_cvtph_ps(_mm256_cvtps_ph(mC, 0));
_mm256_storeu_ps(tmp.data(), mC);
for (int kk =0; kk + i < C_size; kk++) {
C[i + kk] = tmp[kk];
}
}
}
#ifdef LOG_LEVEL_FOR_FBFCPACKEDACC16_PERFORMANCE_LOG
clock_t end = clock();
double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
VLOG(LOG_LEVEL_FOR_FBFCPACKEDACC16_ACCURACY_LOG)
<< "cblas_gemm_compute_acc16 run time = " << elapsed_secs << endl;
#endif
}
void custom_fp16_gemv(
const bool use_acc_fp16,
const bool use_custom_acc32,
const bool use_temp_accumulator,
const CBLAS_TRANSPOSE trans_A,
const int M,
const int N,
const float alpha,
const float* A,
const float* x,
const float beta,
float* y,
CPUContext* context) {
if (use_acc_fp16) {
custom_fp16_gemm_with_trans(
trans_A,
CblasNoTrans,
M,
1,
N,
A,
x,
beta,
y,
true /* use acc_fp16 */,
use_temp_accumulator);
} else if (use_custom_acc32 && use_temp_accumulator) {
custom_fp16_gemm_with_trans(
trans_A,
CblasNoTrans,
M,
1,
N,
A,
x,
beta,
y,
false /* use acc_fp32 */,
use_temp_accumulator);
} else {
math::Gemv<float, CPUContext>(trans_A, M, N, alpha, A, x, beta, y, context);
}
}
void custom_fp16_gemm_batched(
const bool use_acc_fp16,
const bool use_custom_acc32,
const bool use_temp_accumulator,
const CBLAS_TRANSPOSE trans_A,
const CBLAS_TRANSPOSE trans_B,
const int batch_size,
const int M,
const int N,
const int K,
const float alpha,
const float** A,
const float** B,
const float beta,
float** C,
CPUContext* context) {
if (!use_acc_fp16 && (!use_custom_acc32 || !use_temp_accumulator)) {
math::GemmBatched<float, CPUContext>(
trans_A, trans_B, batch_size, M, N, K, alpha, A, B, beta, C, context);
return;
}
for (int i = 0; i < batch_size; ++i) {
if (use_acc_fp16) {
custom_fp16_gemm_with_trans(
trans_A,
trans_B,
M,
K,
N,
A[i],
B[i],
beta,
C[i],
true /* use acc_fp16 */,
use_temp_accumulator);
} else {
CAFFE_ENFORCE(use_custom_acc32 && use_temp_accumulator);
custom_fp16_gemm_with_trans(
trans_A,
trans_B,
M,
K,
N,
A[i],
B[i],
beta,
C[i],
false /* use acc_fp32 */,
use_temp_accumulator);
}
}
}
void custom_fp16_gemm_strided_batched(
const bool use_acc_fp16,
const bool use_custom_acc32,
const bool use_temp_accumulator,
const CBLAS_TRANSPOSE trans_A,
const CBLAS_TRANSPOSE trans_B,
const int batch_size,
const int M,
const int N,
const int K,
const float alpha /* unused */,
const float* A,
const int A_stride,
const float* B,
const int B_stride,
const float beta,
float* C,
const int C_stride,
CPUContext* context) {
// loop over matrices in the batch
for (int i = 0; i < batch_size; ++i) {
if (use_acc_fp16) {
custom_fp16_gemm_with_trans(
trans_A,
trans_B,
M,
K,
N,
A,
B,
beta,
C,
true /* use_acc_fp16 */,
use_temp_accumulator);
} else {
custom_fp16_gemm_with_trans(
trans_A,
trans_B,
M,
K,
N,
A,
B,
beta,
C,
false /* use acc_fp32*/,
use_temp_accumulator);
}
A += A_stride;
B += B_stride;
C += C_stride;
}
}
} // namespace caffe2

View File

@ -1,81 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#include "caffe2/core/context.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
void custom_fp16_gemm(
const int m,
const int k,
const int n,
const float* A_fp16,
const float* B_fp16,
const float beta,
float* C,
const bool use_acc_fp16,
const bool use_temp_accumulator);
void custom_fp16_gemm_with_trans(
const CBLAS_TRANSPOSE trans_A,
const CBLAS_TRANSPOSE trans_B,
const int m,
const int k,
const int n,
const float* A_fp16,
const float* B_fp16,
const float beta,
float* C,
const bool use_acc_fp16,
const bool use_temp_accumulator);
void transpose(const float* A, float* A_trans, int M, int N);
void custom_fp16_gemv(
const bool use_acc_fp16,
const bool use_custom_acc32,
const bool use_temp_accumulator,
const CBLAS_TRANSPOSE trans_A,
const int M,
const int N,
const float alpha,
const float* A,
const float* x,
const float beta,
float* y,
CPUContext* context);
void custom_fp16_gemm_batched(
const bool use_acc_fp16,
const bool use_custom_acc32,
const bool use_temp_accumulator,
const CBLAS_TRANSPOSE trans_A,
const CBLAS_TRANSPOSE trans_B,
const int batch_size,
const int M,
const int N,
const int K,
const float alpha,
const float** A,
const float** B,
const float beta,
float** C,
CPUContext* context);
void custom_fp16_gemm_strided_batched(
const bool use_acc_fp16,
const bool use_custom_acc32,
const bool use_temp_accumulator,
const CBLAS_TRANSPOSE trans_A,
const CBLAS_TRANSPOSE trans_B,
const int batch_size,
const int M,
const int N,
const int K,
const float alpha /* unused */,
const float* A,
const int A_stride,
const float* B,
const int B_stride,
const float beta,
float* C,
const int C_stride,
CPUContext* context);
} // namespace caffe2

View File

@ -1,14 +0,0 @@
#include "caffe2/contrib/fakelowp/int8_dequantize_op_nnpi.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(Int8DequantizeNNPI, int8::Int8DequantizeNNPIOp);
OPERATOR_SCHEMA(Int8DequantizeNNPI)
.IdenticalTypeAndShape()
.NumInputs(1)
.NumOutputs(1)
.Input(0, "qX", "Int8 Tensor qX.")
.Output(0, "Y", "FP32 Tensor that represents mapped real value of qX.");
} // namespace caffe2

View File

@ -1,57 +0,0 @@
#ifndef CAFFE2_OPERATORS_INT8_DEQUANTIZE_OP_H_
#define CAFFE2_OPERATORS_INT8_DEQUANTIZE_OP_H_
#include <fbgemm/FbgemmConvert.h>
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor_int8.h"
#include "caffe2/operators/quantized/int8_utils.h"
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
namespace caffe2 {
namespace int8 {
namespace {
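// Illustrative example (assumed values): with X_scale = 0.5 and X_offset = 10,
// the quantized byte 14 dequantizes to (14 - 10) / (1 / 0.5) = 2.0, i.e. the
// usual (q - zero_point) * scale mapping expressed via the inverse scale.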
void Int8DequantizeNNPI(
const uint8_t* in,
float* out,
const int64_t N,
const float X_scale,
const int32_t X_offset) {
float X_scale_fp32 = 1.0f / X_scale;
for (const auto i : c10::irange(N)) {
out[i] = (float)(static_cast<int32_t>(in[i]) - X_offset) / X_scale_fp32;
}
}
} // namespace
class Int8DequantizeNNPIOp final : public Operator<CPUContext> {
public:
using Operator<CPUContext>::Operator;
bool RunOnDevice() override {
const auto& X = Inputs()[0]->template Get<Int8TensorCPU>();
auto* Y = Output(0, X.t.sizes(), at::dtype<float>());
int32_t X_offset = X.zero_point;
auto X_scale = X.scale;
Int8DequantizeNNPI(
X.t.data<uint8_t>(),
Y->mutable_data<float>(),
X.t.numel(),
X_scale,
X_offset);
// UsingOneOverScale_);
return true;
}
};
} // namespace int8
} // namespace caffe2
#endif // CAFFE2_OPERATORS_INT8_DEQUANTIZE_OP_H_

View File

@ -1,15 +0,0 @@
#include "caffe2/contrib/fakelowp/int8_quantize_op_nnpi.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(Int8QuantizeNNPI, int8::Int8QuantizeNNPIOp);
OPERATOR_SCHEMA(Int8QuantizeNNPI)
.IdenticalTypeAndShape()
.Arg("Y_scale", "Output tensor quantization scale")
.Arg("Y_zero_point", "Output tensor quantization offset")
.NumInputs(1)
.NumOutputs(1)
.Input(0, "X", "FP32 Tensor X.")
.Output(0, "Y", "Int8 Tensor qX representing X with linear quantization.");
} // namespace caffe2

View File

@ -1,108 +0,0 @@
#ifndef CAFFE2_OPERATORS_INT8_QUANTIZE_OP_H_
#define CAFFE2_OPERATORS_INT8_QUANTIZE_OP_H_
#include <fbgemm/FbgemmConvert.h>
#include <cmath>
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor_int8.h"
#include "caffe2/operators/quantized/int8_utils.h"
#include "fp16_fma.h"
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
namespace caffe2 {
namespace int8 {
namespace {
static float ClampScale(float s)
{
const float MinScale(1e-10f);
if (std::fabs(s) < MinScale) {
LOG_EVERY_N(WARNING, 1000) << "Too small scale detected: "
<< s << " clamping to +/-" << MinScale;
return std::signbit(s) ? -MinScale : MinScale;
} else {
return s;
}
}
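// Illustrative example (assumed values): with Y_scale = 0.25 and
// Y_zero_point = 128, an input of 1.0 quantizes to round(1.0 * 4 + 128) = 132,
// which already lies inside the [0, 255] clamp range.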
void Int8QuantizeNNPI(
const float* in,
uint8_t* out,
const int64_t N,
const float Y_scale,
const int32_t Y_offset) {
const int32_t qmin = std::numeric_limits<uint8_t>::min();
const int32_t qmax = std::numeric_limits<uint8_t>::max();
float inv_scale = ClampScale(1 / Y_scale);
float inv_scale_fp16 = 0;
fbgemm::RoundToFloat16(
&inv_scale, &inv_scale_fp16, 1, false /* no clamping */);
float offset_tmp = -Y_offset;
fbgemm::RoundToFloat16(
&offset_tmp, &offset_tmp, 1, false /* no clamping */);
std::vector<float> in_fp16(N);
fbgemm::RoundToFloat16(
in, in_fp16.data(), N, false /* no clamping */);
std::vector<float> inv_scalev(N, inv_scale_fp16);
std::vector<float> offsetv(N, -offset_tmp);
fake_fp16::fma_fp16(N, in_fp16.data(), inv_scalev.data(), offsetv.data());
for (const auto i : c10::irange(N)) {
offsetv[i] = round(offsetv[i]);
}
fbgemm::RoundToFloat16(
offsetv.data(), offsetv.data(), N, false /* no clamping */);
for (const auto i : c10::irange(N)) {
float halfRes = offsetv[i];
if (std::isinf(halfRes)) {
if (halfRes > 0) {
halfRes = qmax;
} else {
halfRes = qmin;
}
}
if (halfRes > qmax) {
halfRes = qmax;
}
if (halfRes < qmin) {
halfRes = qmin;
}
out[i] = static_cast<uint8_t>(halfRes);
}
}
} // namespace
class Int8QuantizeNNPIOp final : public Operator<CPUContext> {
public:
using Operator<CPUContext>::Operator;
bool RunOnDevice() override {
const auto& X = Input(0);
auto* Y = Outputs()[0]->template GetMutable<Int8TensorCPU>();
Y->t.ResizeLike(X);
int32_t Y_offset =
this->template GetSingleArgument<int>("Y_zero_point", 0);
auto Y_scale = this->template GetSingleArgument<float>("Y_scale", 1);
Y->scale = Y_scale;
Y->zero_point = Y_offset;
Int8QuantizeNNPI(
X.data<float>(),
Y->t.mutable_data<uint8_t>(),
X.numel(),
Y_scale,
Y_offset);
return true;
}
};
} // namespace int8
} // namespace caffe2
#endif // CAFFE2_OPERATORS_INT8_QUANTIZE_OP_H_

View File

@ -1,27 +0,0 @@
#include "caffe2/contrib/fakelowp/int8_swish_op_nnpi.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(SwishFakeInt8NNPI, int8::SwishInt8NNPIOp);
OPERATOR_SCHEMA(SwishFakeInt8NNPI)
.IdenticalTypeAndShape()
.Arg("X_scale", "Inout tensor quantization scale")
.Arg("X_zero_point", "Input tensor quantization offset")
.Arg("Y_scale", "Output tensor quantization scale")
.Arg("Y_zero_point", "Output tensor quantization offset")
.NumInputs(1)
.NumOutputs(1)
.SetDoc(R"DOC(
Apply the Swish function element-wise after dequantizing input tensor.
$$Swish(x) = \frac{x}{1+\exp(-x)}$$
Quantize the Swish function output back to Int8.
The input and output of this operator are converted to fp16 precision
before applying the function.
<details>
</details>
)DOC")
.Input(0, "X", "Int8 Tensor X.")
.Output(0, "Y", "Int8 Tensor Y.");
} // namespace caffe2

View File

@ -1,87 +0,0 @@
#ifndef CAFFE2_OPERATORS_INT8_SWISH_OP_H_
#define CAFFE2_OPERATORS_INT8_SWISH_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor_int8.h"
#include "caffe2/operators/quantized/int8_utils.h"
namespace caffe2 {
namespace int8 {
namespace {
using namespace std;
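// Illustrative example (assumed values): with X_scale = 0.1, X_zero_point = 0,
// Y_scale = 0.05 and Y_zero_point = 0, the input byte 20 dequantizes to 2.0,
// swish(2.0) = 2 / (1 + e^-2) ~= 1.7616, and requantizing gives
// round(1.7616 / 0.05) = 35.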
void SwishFakeInt8NNPI(
const uint8_t* in,
uint8_t* out,
const int64_t N,
const float X_scale,
const int32_t X_offset,
const float Y_scale,
const int32_t Y_offset) {
const uint8_t max_val = std::numeric_limits<uint8_t>::max();
const uint8_t min_val = std::numeric_limits<uint8_t>::min();
float X_scale_fp32 = 1.0f / X_scale;
float deq_val = 0.0f;
float deq_swish = 0.0f;
int32_t quant_val = 0;
uint8_t result = 0;
for (const auto i : c10::irange(N)) {
deq_val = (static_cast<uint8_t>(in[i]) - X_offset) / X_scale_fp32;
deq_swish = deq_val / (1 + exp(-deq_val));
quant_val = round(deq_swish / Y_scale + Y_offset);
result = quant_val;
if (quant_val > max_val) {
result = max_val;
}
if (quant_val < min_val) {
result = min_val;
}
out[i] = static_cast<uint8_t>(result);
}
}
} // namespace
class SwishInt8NNPIOp final : public Operator<CPUContext> {
public:
using Operator<CPUContext>::Operator;
template <class... Args>
explicit SwishInt8NNPIOp(Args&&... args)
: Operator<CPUContext>(std::forward<Args>(args)...) {}
bool RunOnDevice() override {
const auto& X = Inputs()[0]->template Get<Int8TensorCPU>();
auto* Y = Outputs()[0]->template GetMutable<Int8TensorCPU>();
Y->t.ResizeLike(X.t);
int32_t Y_offset_ =
this->template GetSingleArgument<int>("Y_zero_point", 0);
auto Y_scale_ = this->template GetSingleArgument<float>("Y_scale", 1);
Y->scale = Y_scale_;
Y->zero_point = Y_offset_;
SwishFakeInt8NNPI(
X.t.data<uint8_t>(),
Y->t.mutable_data<uint8_t>(),
X.t.numel(),
X.scale,
X.zero_point,
Y_scale_,
Y_offset_);
return true;
}
};
} // namespace int8
} // namespace caffe2
#endif // CAFFE2_OPERATORS_INT8_SWISH_OP_H_

View File

@ -1,201 +0,0 @@
#include <algorithm>
#include "layernorm_fp16_fake_op.h"
#include "caffe2/contrib/fakelowp/common.h"
#include "caffe2/contrib/fakelowp/fp16_fma.h"
namespace caffe2 {
void LayerNormUtils::calcY(
const int M,
const int N,
const float* X,
const float* mean,
const float* std,
const float* gamma,
const float* beta,
float* Y) {
ConstEigenArrayMap<float> X_arr(X, N, M);
ConstEigenVectorArrayMap<float> mean_arr(mean, M);
ConstEigenVectorArrayMap<float> std_arr(std, M);
EigenArrayMap<float> Y_arr(Y, N, M);
std::vector<float> normalized(N);
for (int i = 0; i < M; ++i) {
float normFactor = float(1.0f / std_arr[i]);
fbgemm::RoundToFloat16(&normFactor, &normFactor, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
for (int j = 0; j < N; ++j) {
normalized[j] = X_arr.col(i)[j] - mean[i];
}
fbgemm::RoundToFloat16(normalized.data(), normalized.data(), N, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
for (int j = 0; j < N; ++j) {
normalized[j] *= normFactor;
}
fbgemm::RoundToFloat16(normalized.data(), &Y_arr.col(i)[0], N, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
if (gamma != nullptr && beta != nullptr) {
ConstEigenVectorArrayMap<float> gamma_arr(gamma, N);
ConstEigenVectorArrayMap<float> beta_arr(beta, N);
for (int i = 0; i < M; ++i) {
vector<float> res(N);
for (int j = 0; j < N; j++) {
res[j] = beta[j];
}
fake_fp16::fma_fp16(N, &Y_arr.col(i)[0], gamma, res.data());
for (int j = 0; j < N; j++) {
Y_arr.col(i)[j] = res[j];
}
}
}
}
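// ReducedAdd performs a five-level pairwise (tree) reduction over a 32-element
// vector, rounding the partial sums back to fp16 after every level so the
// accumulation stays in fp16 precision.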
float LayerNormUtils::ReducedAdd(const std::vector<float>& vec) {
constexpr int VEC_SIZE = 32;
std::vector<float> v(vec.begin(), vec.end());
for (int factor = 2; factor <=32; factor *=2) {
int range = VEC_SIZE / factor;
for (int i = 0; i < range; ++i) { // 16
v[i] = v[2 * i] + v[2 * i + 1];
}
fbgemm::RoundToFloat16(v.data(), v.data(), range, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
return v[0];
}
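// calcMeanStd accumulates mean = sum(x) / N and E[x^2] = sum(x * (x / N)) with
// fp16 FMAs over 32-wide chunks, then derives var = E[x^2] - mean^2 via one
// more fp16 FMA and returns std = sqrt(var + eps), clamping negative variances
// to zero.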
void LayerNormUtils::calcMeanStd(
const int M,
const int N,
const float eps,
const float* X,
float* mean,
float* std) {
ConstEigenArrayMap<float> X_arr(X, N, M);
std::vector<float> sqr(M, 0.0f);
std::vector<float> var(M, 0.0f);
float inv_N_val = 1.0f / N;
fbgemm::RoundToFloat16(&inv_N_val, &inv_N_val, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
constexpr int VEC_SIZE = 32;
std::vector<float> inv_N_vec(VEC_SIZE, inv_N_val);
std::vector<float> inv_N_prod_vec(VEC_SIZE, 0);
std::vector<float> avgVec(VEC_SIZE, 0.0f);
std::vector<float> sqrVec(VEC_SIZE, 0.0f);
std::vector<float> negMeanVec(M, 0.0f);
int numVecs = N / VEC_SIZE;
int tailSize = N - (numVecs * VEC_SIZE);
vector<float> X_fp16(M * N);
fbgemm::RoundToFloat16(
X, X_fp16.data(), M * N, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
for (int i = 0; i < M; ++i) {
std::fill(avgVec.begin(), avgVec.end(), 0.0f);
std::fill(sqrVec.begin(), sqrVec.end(), 0.0f);
for (int j = 0; j < numVecs; ++j) {
fake_fp16::fma_fp16(
VEC_SIZE,
&X_fp16[i * N + VEC_SIZE * j],
inv_N_vec.data(),
avgVec.data());
for (int k = 0; k < VEC_SIZE; k++) {
inv_N_prod_vec[k] = X_fp16[i * N + VEC_SIZE * j + k] * inv_N_val;
}
fbgemm::RoundToFloat16(
inv_N_prod_vec.data(),
inv_N_prod_vec.data(),
VEC_SIZE,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fake_fp16::fma_fp16(
VEC_SIZE,
&X_fp16[i * N + VEC_SIZE * j],
inv_N_prod_vec.data(),
sqrVec.data());
}
if (tailSize > 0) {
fake_fp16::fma_fp16(
tailSize,
&X_fp16[i * N + VEC_SIZE * numVecs],
inv_N_vec.data(),
avgVec.data());
for (int k = 0; k < tailSize; k++) {
inv_N_prod_vec[k] = X_fp16[i * N + VEC_SIZE * numVecs + k] * inv_N_val;
}
fbgemm::RoundToFloat16(
inv_N_prod_vec.data(),
inv_N_prod_vec.data(),
tailSize,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fake_fp16::fma_fp16(
tailSize,
&X_fp16[i * N + VEC_SIZE * numVecs],
inv_N_prod_vec.data(),
sqrVec.data());
}
mean[i] = ReducedAdd(avgVec);
sqr[i] = ReducedAdd(sqrVec);
}
// compute variance and std deviation
std::copy(mean, mean + M, negMeanVec.begin());
std::transform(negMeanVec.cbegin(),
negMeanVec.cend(),
negMeanVec.begin(),
std::negate<float>());
fake_fp16::fma_fp16(M, mean, negMeanVec.data(), sqr.data());
std::copy(sqr.cbegin(), sqr.cend(), var.begin());
float teps = eps;
std::vector<float> tmpVec(M, 0.0f);
fbgemm::RoundToFloat16(&teps, &teps, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
int i = 0;
for (auto& v: var) {
if (v < 0.0) {
LOG_EVERY_N(WARNING, 1000) << "Variance " << v
<< " negative, resetting to 0.";
v = 0.0;
}
tmpVec[i] = var[i] + teps;
++i;
}
fbgemm::RoundToFloat16(
tmpVec.data(),
tmpVec.data(),
M,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
i = 0;
for (auto& v: tmpVec) {
if (v < 0) {
LOG_EVERY_N(WARNING, 1000) << "Variance " << v
<< " negative, resetting to 0.";
v = 0.0;
}
std[i] = std::sqrt(v);
++i;
}
fbgemm::RoundToFloat16(
std,
std,
M,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
REGISTER_CPU_OPERATOR(LayerNormFakeFP16NNPI, LayerNormFakeFp16Op<false>);
OPERATOR_SCHEMA(LayerNormFakeFP16NNPI).NumInputs({1, 3}).NumOutputs(3);
REGISTER_CPU_OPERATOR(LayerNormInt8QuantizeFakeNNPI,
LayerNormFakeFp16Op<true>);
OPERATOR_SCHEMA(LayerNormInt8QuantizeFakeNNPI)
.IdenticalTypeAndShape()
.NumInputs({1, 3})
.NumOutputs(3);
} // namespace caffe2

View File

@ -1,207 +0,0 @@
#pragma once
#include <algorithm>
#include <array>
#include <functional>
#include <string>
#include <vector>
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include <fbgemm/FbgemmConvert.h>
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"
#include "fp16_fma.h"
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
namespace caffe2 {
class LayerNormUtils {
public:
static void calcY(
const int M,
const int N,
const float* X,
const float* mean,
const float* std,
const float* gamma,
const float* beta,
float* Y);
static void calcMeanStd(
const int M,
const int N,
const float eps,
const float* X,
float* mean,
float* std);
static float ReducedAdd(const std::vector<float>& vec);
};
template <bool quantizeOutput=false>
class LayerNormFakeFp16Op final : public Operator<CPUContext> {
public:
template <class... Args>
explicit LayerNormFakeFp16Op(Args&&... args)
: Operator<CPUContext>(std::forward<Args>(args)...),
OP_SINGLE_ARG(int, "axis", axis_, 1),
OP_SINGLE_ARG(float, "epsilon", epsilon_, 1e-5f),
OP_SINGLE_ARG(bool, "elementwise_affine", elementwise_affine_, false) {}
~LayerNormFakeFp16Op() noexcept override {}
bool RunOnDevice() override {
return DoRunWithType();
}
bool DoRunWithType() {
const auto& X = Input(INPUT);
vector <float> Y_fp16;
Tensor *Y;
if (!quantizeOutput) {
Y = Output(OUTPUT, X.sizes(), at::dtype<float>());
} else {
Y_fp16.resize(X.numel());
}
CAFFE_ENFORCE_GE(X.dim(), 2, "LayerNorm requires input dim >=2.");
const int canonical_axis = X.canonical_axis_index(axis_);
std::vector<int64_t> moments_dims(
X.sizes().cbegin(), X.sizes().cbegin() + canonical_axis);
moments_dims.push_back(1);
auto* mean = Output(MEAN, moments_dims, at::dtype<float>());
auto* sigma = Output(STD, moments_dims, at::dtype<float>());
const int M = X.size_to_dim(canonical_axis);
const int N = X.size_from_dim(canonical_axis);
if (!quantizeOutput) {
Y->ResizeLike(X);
}
const float* X_data = X.template data<float>();
float *Y_data;
if (!quantizeOutput) {
Y_data = Y->template mutable_data<float>();
} else {
Y_data = Y_fp16.data();
}
float* mean_data = mean->template mutable_data<float>();
float* sigma_data = sigma->template mutable_data<float>();
std::vector<float> X_rounded(X.numel());
fbgemm::RoundToFloat16(
X_data,
X_rounded.data(),
X.numel(),
FLAGS_caffe2_fbgemm_fake_fp16_clamp,
false /*USE_ACC_FP16*/);
X_data = X_rounded.data();
// Mean and Standard Deviation computation for the input data
LayerNormUtils::calcMeanStd(M, N, epsilon_, X_data, mean_data, sigma_data);
const float* gamma_data = nullptr;
const float* beta_data = nullptr;
// Layer Normalized Output computation
LayerNormUtils::calcY(
M, N, X_data, mean_data, sigma_data, gamma_data, beta_data, Y_data);
if (InputSize() == 3) {
// handle scale and bias via fp16_fma
std::vector<float> scale_data(N);
std::vector<float> bias_data(N);
fbgemm::RoundToFloat16(
Input(1).template data<float>(),
scale_data.data(),
N,
FLAGS_caffe2_fbgemm_fake_fp16_clamp,
false /*USE_ACC_FP16*/);
fbgemm::RoundToFloat16(
Input(2).template data<float>(),
bias_data.data(),
N,
FLAGS_caffe2_fbgemm_fake_fp16_clamp,
false /*USE_ACC_FP16*/);
for (const auto i : c10::irange(M)) {
// fma_fp16(A, B, Out) -> Out = A * B + Out
std::vector<float> out(N);
std::memcpy(out.data(), bias_data.data(), sizeof(float) * N);
fake_fp16::fma_fp16(N, Y_data + i * N, scale_data.data(), out.data());
std::memcpy(Y_data + i * N, out.data(), sizeof(float) * N);
}
}
// Quantize
// We should be using the same quantization function as int8 quantize,
// but we need to adjust for int8 vs uint8 bias. A simple shift of the output
// is not enough because this causes problems when rounding inside the fma.
// TODO: figure out how to commonize this with int8 quantize
if (quantizeOutput) {
auto* Y_int8 = Outputs()[0]->template GetMutable<int8::Int8TensorCPU>();
Y_int8->t.ResizeLike(X);
int32_t Y_offset =
this->template GetSingleArgument<int>("Y_zero_point", 0);
auto Y_scale = this->template GetSingleArgument<float>("Y_scale", 1);
float inv_scale = 1.0f / Y_scale;
fbgemm::RoundToFloat16(
&inv_scale, &inv_scale, 1, false /* no clamping */);
Y_int8->scale = Y_scale;
Y_int8->zero_point = Y_offset;
int Nout = X.numel();
std::vector<float> inv_scalev(Nout, inv_scale);
std::vector<float> offsetv(Nout, Y_offset);
uint8_t* Y_uint8_data = Y_int8->t.template mutable_data<uint8_t>();
fake_fp16::fma_fp16(Nout, Y_fp16.data(), inv_scalev.data(), offsetv.data());
const int32_t qmin = std::numeric_limits<uint8_t>::min();
const int32_t qmax = std::numeric_limits<uint8_t>::max();
for (const auto i : c10::irange(Nout)) {
float halfRes = offsetv[i];
halfRes = round(halfRes);
if (std::isinf(halfRes)) {
if (halfRes > 0) {
halfRes = qmax;
} else {
halfRes = qmin;
}
}
if (halfRes > qmax) {
halfRes = qmax;
}
if (halfRes < qmin) {
halfRes = qmin;
}
Y_uint8_data[i] = static_cast<uint8_t>(halfRes);
}
}
return true;
}
private:
const int axis_;
const float epsilon_;
// LayerNorm FP16 FakeLowP Op applies the scales and biases (or gamma and beta)
// whenever those inputs are provided; otherwise it omits them.
// We are keeping elementwise_affine to keep it consistent with LayerNorm FP32 Op.
const bool elementwise_affine_;
INPUT_TAGS(INPUT);
OUTPUT_TAGS(OUTPUT, MEAN, STD);
};
} // namespace caffe2

View File

@ -1,163 +0,0 @@
#include "lengths_reducer_fused_4bit_rowwise_fp16_fake_op.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(
SparseLengthsSumFused4BitRowwiseFakeFP16NNPI,
SparseLengthsFused4BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/false>);
OPERATOR_SCHEMA(SparseLengthsSumFused4BitRowwiseFakeFP16NNPI)
.NumInputs(3)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, false>::DATA,
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, false>::INDICES,
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, false>::LENGTHS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsSum, but operating on
4-bit rowwise quantized matrices with fused storage (where each row
stores quantized values, and then 2-byte scale and 2-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused4BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Output(0, "output", "output")
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsSumFused4BitRowwiseFakeFP16NNPI);
REGISTER_CPU_OPERATOR(
SparseLengthsSumFused4BitRowwiseFakeFP16EmbeddingOnly,
SparseLengthsFused4BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/false,
/*use_fp16_for_embedding_only=*/true>);
OPERATOR_SCHEMA(SparseLengthsSumFused4BitRowwiseFakeFP16EmbeddingOnly)
.NumInputs(3)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, false, true>::DATA,
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, false, true>::
INDICES,
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, false, true>::
LENGTHS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsSum, but operating on
4-bit rowwise quantized matrices with fused storage (where each row
stores quantized values, and then 2-byte scale and 2-byte bias).
Convert only embedding entries using fake fp16.
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused4BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Output(0, "output", "output")
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsSumFused4BitRowwiseFakeFP16EmbeddingOnly);
REGISTER_CPU_OPERATOR(
SparseLengthsWeightedSumFused4BitRowwiseFakeFP16NNPI,
SparseLengthsFused4BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/true>);
OPERATOR_SCHEMA(SparseLengthsWeightedSumFused4BitRowwiseFakeFP16NNPI)
.NumInputs(4)
.NumOutputs(1)
.WeightedValueKeyLengthInputFillers(
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, true>::DATA,
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, true>::INDICES,
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, true>::LENGTHS,
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, true>::WEIGHTS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsWeightedSum,
but operating on 4-bit rowwise quantized matrices with fused storage
(where each row stores quantized values, and then 2-byte scale and 2-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused4BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Input(
3,
"WEIGHTS",
"Vector of weights to scale rows of DATA with before reduction")
.Output(0, "output", "output");
NO_GRADIENT(SparseLengthsWeightedSumFused4BitRowwiseFakeFP16NNPI);
REGISTER_CPU_OPERATOR(
SparseLengthsWeightedSumFused4BitRowwiseFakeFP16EmbeddingOnly,
SparseLengthsFused4BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/true,
/*use_fp16_for_embedding_only=*/true>);
OPERATOR_SCHEMA(SparseLengthsWeightedSumFused4BitRowwiseFakeFP16EmbeddingOnly)
.NumInputs(4)
.NumOutputs(1)
.WeightedValueKeyLengthInputFillers(
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, true, true>::DATA,
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, true, true>::
INDICES,
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, true, true>::
LENGTHS,
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, true, true>::
WEIGHTS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsWeightedSum,
but operating on 4-bit rowwise quantized matrices with fused storage
(where each row stores quantized values, and then 2-byte scale and 2-byte bias).
Convert only embedding entries using fake fp16.
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused4BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Input(
3,
"WEIGHTS",
"Vector of weights to scale rows of DATA with before reduction")
.Output(0, "output", "output");
NO_GRADIENT(SparseLengthsWeightedSumFused4BitRowwiseFakeFP16EmbeddingOnly);
} // namespace caffe2
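The schemas above all refer to the fused 4-bit rowwise format: each row packs two 4-bit values per byte, followed by a 2-byte fp16 scale and a 2-byte fp16 bias. Below is a simplified, self-contained sketch of unpacking and dequantizing one such row; the scale and bias are passed as plain floats here (the real rows store them as fp16 at the end of the row), and the helper name is illustrative only:

#include <cstdint>
#include <cstdio>
#include <vector>

// Dequantize one row of packed 4-bit values: low nibble first, then high
// nibble, with x = q * scale + bias.
std::vector<float> dequantize_4bit_row(const uint8_t* packed, int packed_bytes,
                                       float scale, float bias) {
  std::vector<float> out;
  out.reserve(packed_bytes * 2); // two 4-bit values per byte
  for (int j = 0; j < packed_bytes; ++j) {
    out.push_back((packed[j] & 0x0f) * scale + bias); // low nibble
    out.push_back((packed[j] >> 4) * scale + bias);   // high nibble
  }
  return out;
}

int main() {
  const uint8_t row[2] = {0x21, 0x43}; // packed values 1, 2, 3, 4
  for (float v : dequantize_4bit_row(row, 2, 0.5f, -1.0f)) {
    std::printf("%g ", v); // prints -0.5 0 0.5 1
  }
  return 0;
}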

View File

@ -1,216 +0,0 @@
#pragma once
#include <immintrin.h>
#include "caffe2/perfkernels/fused_8bit_rowwise_embedding_lookup.h"
#include "fp16_fma.h"
#include "lengths_reducer_ops.h"
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp_denorms);
namespace caffe2 {
template <
class Context,
bool with_weights = 0,
bool use_fp16_for_embedding_only = 0>
class SparseLengthsFused4BitRowwiseFakeFP16Op final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
explicit SparseLengthsFused4BitRowwiseFakeFP16Op(
const OperatorDef& operator_def,
Workspace* ws)
: Operator<Context>(operator_def, ws) {}
~SparseLengthsFused4BitRowwiseFakeFP16Op() noexcept override {}
bool RunOnDevice() override {
return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
this, Input(INDICES));
}
template <typename IndexType>
bool DoRunWithType() {
const auto& data = Input(DATA);
const auto& indices = Input(INDICES);
const auto& lengths = Input(LENGTHS);
CAFFE_ENFORCE_EQ(indices.dim(), 1, "INDICES must be a vector");
CAFFE_ENFORCE_EQ(lengths.dim(), 1, "LENGTHS must be a vector");
const float* weights = nullptr;
if (with_weights) {
const auto& weights_input = Input(WEIGHTS);
CAFFE_ENFORCE_EQ(weights_input.dim(), 1, "WEIGHTS must be a vector");
CAFFE_ENFORCE_EQ(
weights_input.numel(),
indices.numel(),
"WEIGHTS should have the same length as INDICES.");
weights = weights_input.template data<float>();
}
CAFFE_ENFORCE_GT(
data.size(1),
sizeof(at::Half) * 2,
"DATA must have more than 4 columns");
constexpr int NUM_ELEM_PER_BYTE = 2;
// Subtract 4 from the #columns of data for the 2-byte scale and 2-byte
// bias that we use in the fused representation (per row).
const std::vector<int64_t> shape = {
lengths.size(0),
static_cast<int64_t>(data.size(1) - 2 * sizeof(at::Half)) *
NUM_ELEM_PER_BYTE};
auto* output = Output(0, shape, at::dtype<float>());
// Copied from Fused8BitRowwiseEmbeddingLookupGenericSlow in
// fused_8bit_rowwise_embedding_lookup.cc
int64_t output_block_size = output->size(1);
CAFFE_ENFORCE_EQ(
output_block_size % NUM_ELEM_PER_BYTE,
0,
"block size must be divisible by 2");
int64_t input_block_size = output_block_size / NUM_ELEM_PER_BYTE;
int64_t output_size = output->size(0);
int64_t index_size = indices.numel();
int64_t data_size = data.size(0);
const uint8_t* input = data.template data<uint8_t>();
const IndexType* indices_data = indices.template data<IndexType>();
const int* lengths_data = lengths.template data<int>();
float* out = output->template mutable_data<float>();
std::vector<float> rowTempSums[2];
rowTempSums[0].resize(output_block_size);
rowTempSums[1].resize(output_block_size);
const auto scale_bias_offset = 2 * sizeof(at::Half);
const int64_t input_fused_block_size = input_block_size + scale_bias_offset;
int64_t current = 0;
for (const auto m : c10::irange(output_size)) {
if (!use_fp16_for_embedding_only) {
memset(rowTempSums[0].data(), 0, sizeof(float) * output_block_size);
memset(rowTempSums[1].data(), 0, sizeof(float) * output_block_size);
}
memset(out, 0, sizeof(float) * output_block_size);
if (current + lengths_data[m] > index_size) {
return false;
}
for (int i = 0; i < lengths_data[m]; ++i) {
int64_t idx = indices_data[current];
int accIdx = 0;
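// Use double-buffered accumulation only when the block size is even, at most
// 96, and the number of data columns is even; otherwise everything
// accumulates into buffer 0.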
if (output_block_size % 2 == 0 && output_block_size <= 96 &&
data.size(1) % 2 == 0) {
accIdx = i % 2;
}
if (idx < 0 || idx >= data_size) {
return false;
}
const at::Half* scale_bias = reinterpret_cast<const at::Half*>(
input + input_fused_block_size * indices_data[current] +
input_block_size);
float weight = 1.0f;
if (weights) {
weight = weights[current];
if (!use_fp16_for_embedding_only) {
// Fake fp16 rounding of weight
fbgemm::RoundToFloat16(
&weight, &weight, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
}
float scale = scale_bias[0];
float bias = scale_bias[1];
if (!use_fp16_for_embedding_only) {
scale *= weight;
fbgemm::RoundToFloat16(
&scale, &scale, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
// Unpack int4 elements
std::vector<float> input_rounded(output_block_size);
int k = 0;
for (const auto j : c10::irange(input_block_size)) {
input_rounded[k++] =
input[input_fused_block_size * indices_data[current] + j] & 0x0f;
input_rounded[k++] =
input[input_fused_block_size * indices_data[current] + j] >> 4;
}
if (use_fp16_for_embedding_only) {
std::vector<float> product_rounded(output_block_size);
TypedAxpy<float, float>(
output_block_size,
scale,
input_rounded.data(),
product_rounded.data());
for (const auto j : c10::irange(output_block_size)) {
product_rounded[j] += bias;
}
// Fake fp16 rounding of scale x input + bias
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(product_rounded.data()),
product_rounded.data(),
output_block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp,
FLAGS_caffe2_fbgemm_fake_fp16_clamp_denorms);
// Accumulate w x (scale x input + bias) to output
TypedAxpy<float, float>(
output_block_size,
weight,
reinterpret_cast<const float*>(product_rounded.data()),
out);
} else {
std::vector<float> product(output_block_size);
std::vector<float> scalev(output_block_size, scale);
std::vector<float> mBias(output_block_size, bias);
std::vector<float> mWeight(output_block_size, weight);
fake_fp16::fma_fp16(
output_block_size,
mBias.data(),
mWeight.data(),
rowTempSums[accIdx].data());
fake_fp16::fma_fp16(
output_block_size,
scalev.data(),
input_rounded.data(),
rowTempSums[accIdx].data());
}
++current;
}
if (!use_fp16_for_embedding_only) {
for (const auto j : c10::irange(output_block_size)) {
out[j] = rowTempSums[0][j] + rowTempSums[1][j];
}
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(out),
out,
output_block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
out += output_block_size;
}
return current == index_size;
}
enum {
DATA = 0,
WEIGHTS = 1,
INDICES = 1 + with_weights,
LENGTHS = 2 + with_weights,
};
};
} // namespace caffe2
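The accIdx logic above alternates accumulation between two partial sums and combines them once per output row. Below is a minimal sketch of that double-buffering pattern with plain floats; the removed op accumulates with fma_fp16, where shortening each dependent chain of fp16 roundings is the point of the trick:

#include <cstdio>
#include <vector>

int main() {
  const std::vector<std::vector<float>> rows = {
      {1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}};
  const int block_size = 2;
  // Two partial sums; rows alternate between them.
  std::vector<float> acc[2] = {std::vector<float>(block_size, 0.0f),
                               std::vector<float>(block_size, 0.0f)};
  for (size_t i = 0; i < rows.size(); ++i) {
    const int accIdx = i % 2;
    for (int j = 0; j < block_size; ++j) {
      acc[accIdx][j] += rows[i][j];
    }
  }
  for (int j = 0; j < block_size; ++j) {
    std::printf("%g ", acc[0][j] + acc[1][j]); // prints 9 12
  }
  return 0;
}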

View File

@ -1,722 +0,0 @@
#include "lengths_reducer_fused_8bit_rowwise_fp16_fake_op.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(
SparseLengthsSumFused8BitRowwiseFakeFP16,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext>);
OPERATOR_SCHEMA(SparseLengthsSumFused8BitRowwiseFakeFP16)
.NumInputs(3)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext>::DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext>::INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext>::LENGTHS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsSum, but operating on
8-bit rowwise quantized matrices with fused storage (where each row
stores quantized values, and then 4-byte scale and 4-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Output(0, "output", "output")
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsSumFused8BitRowwiseFakeFP16);
REGISTER_CPU_OPERATOR(
SparseLengthsSumFused8BitRowwiseFakeFP16EmbeddingOnly,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/false,
/*is_mean=*/false,
/*use_acc_fp16=*/false,
/*use_inv_scale=*/false,
/*use_nnpi_fma=*/false,
/*use_fp16_for_embedding_only=*/true>);
OPERATOR_SCHEMA(SparseLengthsSumFused8BitRowwiseFakeFP16EmbeddingOnly)
.NumInputs(3)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
false,
false,
false,
false,
false,
true>::DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
false,
false,
false,
false,
false,
true>::INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
false,
false,
false,
false,
false,
true>::LENGTHS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsSum, but operating on
8-bit rowwise quantized matrices with fused storage (where each row
stores quantized values, and then 4-byte scale and 4-byte bias).
Convert only embedding entries using fake fp16.
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Output(0, "output", "output")
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsSumFused8BitRowwiseFakeFP16EmbeddingOnly);
REGISTER_CPU_OPERATOR(
SparseLengthsSumFused8BitRowwiseFakeFP16NNPI,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/false,
/*is_mean=*/false,
/*use_acc_fp16=*/true,
/*use_inv_scale=*/false,
/*use_nnpi_fma=*/true>);
OPERATOR_SCHEMA(SparseLengthsSumFused8BitRowwiseFakeFP16NNPI)
.NumInputs(3)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
false,
false,
true>::DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
false,
false,
true>::INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
false,
false,
true>::LENGTHS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsSum, but operating on
8-bit rowwise quantized matrices with fused storage (where each row
stores quantized values, and then 4-byte scale and 4-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Output(0, "output", "output")
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsSumFused8BitRowwiseFakeFP16NNPI);
REGISTER_CPU_OPERATOR(
SparseLengthsSumFused8BitRowwiseFakeFP32NNPI,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/false,
/*is_mean=*/false,
/*use_acc_fp16=*/false,
/*use_inv_scale=*/false,
/*use_nnpi_fp16_fma=*/false,
/*use_fp16_for_embedding_only*/ false,
/*use_acc_fp32*/ true>);
OPERATOR_SCHEMA(SparseLengthsSumFused8BitRowwiseFakeFP32NNPI)
.NumInputs(3)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
true,
false,
false,
false,
false,
false,
true>::DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
true,
false,
false,
false,
false,
false,
true>::INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
true,
false,
false,
false,
false,
false,
true>::LENGTHS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsSum, but operating on
8-bit rowwise quantized matrices with fused storage (where each row
stores quantized values, and then 4-byte scale and 4-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Output(0, "output", "output")
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsSumFused8BitRowwiseFakeFP32NNPI);
REGISTER_CPU_OPERATOR(
SparseLengthsSumFused8BitRowwiseFakeFP16AccFP16,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/false,
/*is_mean=*/false,
/*use_acc_fp16=*/true>);
OPERATOR_SCHEMA(SparseLengthsSumFused8BitRowwiseFakeFP16AccFP16)
.NumInputs(3)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
false,
false,
true>::DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
false,
false,
true>::INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
false,
false,
true>::LENGTHS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsSum, but operating on
8-bit rowwise quantized matrices with fused storage (where each row
stores quantized values, and then 4-byte scale and 4-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Output(0, "output", "output")
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsSumFused8BitRowwiseFakeFP16AccFP16);
REGISTER_CPU_OPERATOR(
SparseLengthsSumFused8BitRowwiseFakeFP16AccInvScaleFP16,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights*/ false,
/*is_mean*/ 0,
/*use_acc_fp16*/ true,
/*use_inv_scale*/ true>);
OPERATOR_SCHEMA(SparseLengthsSumFused8BitRowwiseFakeFP16AccInvScaleFP16)
.NumInputs(3)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
false,
false,
true>::DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
false,
false,
true>::INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
false,
false,
true>::LENGTHS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsSum, but operating on
8-bit rowwise quantized matrices with fused storage (where each row
stores quantized values, and then 4-byte scale and 4-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Output(0, "output", "output")
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsSumFused8BitRowwiseFakeFP16AccInvScaleFP16);
REGISTER_CPU_OPERATOR(
SparseLengthsWeightedSumFused8BitRowwiseFakeFP16,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, /*with_weights=*/true>);
OPERATOR_SCHEMA(SparseLengthsWeightedSumFused8BitRowwiseFakeFP16)
.NumInputs(4)
.NumOutputs(1)
.WeightedValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true>::DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true>::INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true>::LENGTHS,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true>::WEIGHTS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsWeightedSum,
but operating on 8-bit rowwise quantized matrices with fused storage
(where each row stores quantized values, and then 4-byte scale and 4-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Input(
3,
"WEIGHTS",
"Vector of weights to scale rows of DATA with before reduction")
.Output(0, "output", "output");
NO_GRADIENT(SparseLengthsWeightedSumFused8BitRowwiseFakeFP16);
REGISTER_CPU_OPERATOR(
SparseLengthsWeightedSumFused8BitRowwiseFakeFP16EmbeddingOnly,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/true,
/*is_mean=*/false,
/*use_acc_fp16=*/false,
/*use_inv_scale=*/false,
/*use_nnpi_fma=*/false,
/*use_fp16_for_embedding_only=*/true>);
OPERATOR_SCHEMA(SparseLengthsWeightedSumFused8BitRowwiseFakeFP16EmbeddingOnly)
.NumInputs(4)
.NumOutputs(1)
.WeightedValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
true,
false,
false,
false,
false,
true>::DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
true,
false,
false,
false,
false,
true>::INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
true,
false,
false,
false,
false,
true>::LENGTHS,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
true,
false,
false,
false,
false,
true>::WEIGHTS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsWeightedSum,
but operating on 8-bit rowwise quantized matrices with fused storage
(where each row stores quantized values, and then 4-byte scale and 4-byte bias).
Convert only embedding entries using fake fp16.
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Input(
3,
"WEIGHTS",
"Vector of weights to scale rows of DATA with before reduction")
.Output(0, "output", "output");
NO_GRADIENT(SparseLengthsWeightedSumFused8BitRowwiseFakeFP16EmbeddingOnly);
REGISTER_CPU_OPERATOR(
SparseLengthsWeightedSumFused8BitRowwiseFakeFP16AccFP16,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/true,
/*is_mean=*/false,
/*use_acc_fp16=*/true>);
OPERATOR_SCHEMA(SparseLengthsWeightedSumFused8BitRowwiseFakeFP16AccFP16)
.NumInputs(4)
.NumOutputs(1)
.WeightedValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true, false, true>::
DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true, false, true>::
INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true, false, true>::
LENGTHS,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true, false, true>::
WEIGHTS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsWeightedSum,
but operating on 8-bit rowwise quantized matrices with fused storage
(where each row stores quantized values, and then 4-byte scale and 4-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Input(
3,
"WEIGHTS",
"Vector of weights to scale rows of DATA with before reduction")
.Output(0, "output", "output");
NO_GRADIENT(SparseLengthsWeightedSumFused8BitRowwiseFakeFP16AccFP16);
REGISTER_CPU_OPERATOR(
SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/true,
/*is_mean=*/false,
/*use_acc_fp16=*/true,
/*use_inv_scale=*/false,
/*use_fma=*/true>);
OPERATOR_SCHEMA(SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI)
.NumInputs(4)
.NumOutputs(1)
.WeightedValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true, false, true>::
DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true, false, true>::
INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true, false, true>::
LENGTHS,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true, false, true>::
WEIGHTS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsWeightedSum,
but operating on 8-bit rowwise quantized matrices with fused storage
(where each row stores quantized values, and then 4-byte scale and 4-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Input(
3,
"WEIGHTS",
"Vector of weights to scale rows of DATA with before reduction")
.Output(0, "output", "output");
NO_GRADIENT(SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI);
REGISTER_CPU_OPERATOR(
SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/true,
/*is_mean=*/false,
/*use_acc_fp16=*/false,
/*use_inv_scale=*/false,
/*use_nnpi_fp16_fma=*/false,
/*use_fp16_for_embedding_only*/ false,
/*use_acc_fp32*/ true>);
OPERATOR_SCHEMA(SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI)
.NumInputs(4)
.NumOutputs(1)
.WeightedValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
true,
false,
false,
false,
false,
false,
true>::DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
true,
false,
false,
false,
false,
false,
true>::INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
true,
false,
false,
false,
false,
false,
true>::LENGTHS,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
true,
false,
false,
false,
false,
false,
true>::WEIGHTS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsWeightedSum,
but operating on 8-bit rowwise quantized matrices with fused storage
(where each row stores quantized values, and then 4-byte scale and 4-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Input(
3,
"WEIGHTS",
"Vector of weights to scale rows of DATA with before reduction")
.Output(0, "output", "output");
NO_GRADIENT(SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI);
REGISTER_CPU_OPERATOR(
SparseLengthsWeightedSumFused8BitRowwiseFakeFP16AccInvScaleFP16,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/true,
/*is_mean=*/false,
/*use_acc_fp16=*/true,
/*use_inv_scale=*/true>);
OPERATOR_SCHEMA(SparseLengthsWeightedSumFused8BitRowwiseFakeFP16AccInvScaleFP16)
.NumInputs(4)
.NumOutputs(1)
.WeightedValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true, false, true>::
DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true, false, true>::
INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true, false, true>::
LENGTHS,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true, false, true>::
WEIGHTS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsWeightedSum,
but operating on 8-bit rowwise quantized matrices with fused storage
(where each row stores quantized values, and then 4-byte scale and 4-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Input(
3,
"WEIGHTS",
"Vector of weights to scale rows of DATA with before reduction")
.Output(0, "output", "output");
NO_GRADIENT(SparseLengthsWeightedSumFused8BitRowwiseFakeFP16AccInvScaleFP16);
REGISTER_CPU_OPERATOR(
SparseLengthsMeanFused8BitRowwiseFakeFP16,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/false,
/*is_mean=*/true>);
OPERATOR_SCHEMA(SparseLengthsMeanFused8BitRowwiseFakeFP16)
.NumInputs(3)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, false, true>::DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, false, true>::
INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, false, true>::
LENGTHS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsMean, but
operating on 8-bit rowwise quantized matrices with fused storage
(where each row stores quantized values, and then 4-byte scale and 4-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Output(0, "output", "output");
NO_GRADIENT(SparseLengthsMeanFused8BitRowwiseFakeFP16);
REGISTER_CPU_OPERATOR(
SparseLengthsMeanFused8BitRowwiseFakeFP16AccFP16,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/false,
/*is_mean=*/true,
/*use_acc_fp16=*/true>);
OPERATOR_SCHEMA(SparseLengthsMeanFused8BitRowwiseFakeFP16AccFP16)
.NumInputs(3)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, false, true, true>::
DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, false, true, true>::
INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, false, true, true>::
LENGTHS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsMean, but
operating on 8-bit rowwise quantized matrices with fused storage
(where each row stores quantized values, and then 4-byte scale and 4-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Output(0, "output", "output");
NO_GRADIENT(SparseLengthsMeanFused8BitRowwiseFakeFP16AccFP16);
} // namespace caffe2
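The schemas above describe the fused 8-bit rowwise format: each row stores its uint8 quantized values followed by a 4-byte float scale and a 4-byte float bias, and dequantization is x = q * scale + bias. Below is a self-contained sketch of reading one such row; the helper name is illustrative, not part of the removed code:

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// Dequantize a fused 8-bit row: block_size uint8 values, then scale, then bias.
std::vector<float> dequantize_8bit_row(const uint8_t* row, int block_size) {
  float scale = 0.0f, bias = 0.0f;
  std::memcpy(&scale, row + block_size, sizeof(float));
  std::memcpy(&bias, row + block_size + sizeof(float), sizeof(float));
  std::vector<float> out(block_size);
  for (int j = 0; j < block_size; ++j) {
    out[j] = row[j] * scale + bias;
  }
  return out;
}

int main() {
  // Two quantized values (0 and 255) with scale 1/255 and bias 0.
  uint8_t row[2 + 2 * sizeof(float)] = {0, 255};
  const float scale = 1.0f / 255.0f, bias = 0.0f;
  std::memcpy(row + 2, &scale, sizeof(float));
  std::memcpy(row + 2 + sizeof(float), &bias, sizeof(float));
  for (float v : dequantize_8bit_row(row, 2)) {
    std::printf("%g ", v); // prints 0 1
  }
  return 0;
}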

View File

@ -1,312 +0,0 @@
#pragma once
#include "caffe2/perfkernels/fused_8bit_rowwise_embedding_lookup.h"
#include "fp16_fma.h"
#include "lengths_reducer_ops.h"
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp_denorms);
namespace caffe2 {
template <
class Context,
bool with_weights = 0,
bool is_mean = 0,
bool use_acc_fp16 = 0,
bool use_inv_scale = 0,
bool use_nnpi_fma = 0,
bool use_fp16_for_embedding_only = 0,
bool use_acc_fp32 = 0>
class SparseLengthsFused8BitRowwiseFakeFP16Op final : public Operator<Context> {
public:
static_assert(
!(with_weights && is_mean),
"Cannot have with_weights and is_mean a the same time");
USE_OPERATOR_CONTEXT_FUNCTIONS;
explicit SparseLengthsFused8BitRowwiseFakeFP16Op(
const OperatorDef& operator_def,
Workspace* ws)
: Operator<Context>(operator_def, ws) {}
~SparseLengthsFused8BitRowwiseFakeFP16Op() noexcept override {}
bool RunOnDevice() override {
return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
this, Input(INDICES));
}
template <typename IndexType>
bool DoRunWithType() {
const auto& data = Input(DATA);
const auto& indices = Input(INDICES);
const auto& lengths = Input(LENGTHS);
CAFFE_ENFORCE_EQ(indices.dim(), 1, "INDICES must be a vector");
CAFFE_ENFORCE_EQ(lengths.dim(), 1, "LENGTHS must be a vector");
const float* weights = nullptr;
if (with_weights) {
const auto& weights_input = Input(WEIGHTS);
CAFFE_ENFORCE_EQ(weights_input.dim(), 1, "WEIGHTS must be a vector");
CAFFE_ENFORCE_EQ(
weights_input.numel(),
indices.numel(),
"WEIGHTS should have the same length as INDICES.");
weights = weights_input.template data<float>();
}
CAFFE_ENFORCE_GT(data.size(1), 8, "DATA must have more than 8 columns");
// Subtract 8 from the #columns of data for the 4 bytes for scale and 4
// bytes for bias that we use in the fused representation (per row).
const std::vector<int64_t> shape = {lengths.size(0), data.size(1) - 8};
auto* output = Output(0, shape, at::dtype<float>());
// Copied from Fused8BitRowwiseEmbeddingLookupGenericSlow in
// fused_8bit_rowwise_embedding_lookup.cc
int64_t block_size = output->size(1);
int64_t output_size = output->size(0);
int64_t index_size = indices.numel();
int64_t data_size = data.size(0);
const uint8_t* input = data.template data<uint8_t>();
const IndexType* indices_data = indices.template data<IndexType>();
const int* lengths_data = lengths.template data<int>();
bool normalize_by_length = is_mean;
float* out = output->template mutable_data<float>();
std::vector<float> rowTempSums[2];
rowTempSums[0].resize(block_size);
rowTempSums[1].resize(block_size);
// block_size is the number of elements and fused_block_size is the size of
// an entire row, including scale and bias.
const auto scale_bias_offset = 8 / sizeof(uint8_t);
const int64_t fused_block_size = block_size + scale_bias_offset;
int64_t current = 0;
for (const auto m : c10::irange(output_size)) {
memset(out, 0, sizeof(float) * block_size);
memset(rowTempSums[0].data(), 0, sizeof(float) * block_size);
memset(rowTempSums[1].data(), 0, sizeof(float) * block_size);
if (current + lengths_data[m] > index_size) {
return false;
}
for (int i = 0; i < lengths_data[m]; ++i) {
int64_t idx = indices_data[current];
int accIdx = 0;
// Only do double buffer accumulation when block size is even
if (use_nnpi_fma && block_size % 2 == 0 && block_size <= 96) {
accIdx = i % 2;
}
if (idx < 0 || idx >= data_size) {
return false;
}
const float* scale_bias = reinterpret_cast<const float*>(
input + fused_block_size * indices_data[current] + block_size);
float weight = 1.0f;
if (weights) {
weight = weights[current];
if (!use_fp16_for_embedding_only && !use_acc_fp32) {
// Fake fp16 rounding of weight
fbgemm::RoundToFloat16(
&weight, &weight, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
}
float scale = scale_bias[0];
float bias = scale_bias[1];
// The vendor might store the scale as s' = 1 / s, which implies b' = b / s.
// We compute x = x_q * s + b, while the vendor computes x = (x_q + b') / s'.
// Equating the two expressions yields the relations above.
if (use_inv_scale) {
constexpr float kEpsilon = 1e-8;
if (fabs(scale) < kEpsilon) {
if (scale < 0) {
scale = -kEpsilon;
} else {
scale = kEpsilon;
}
}
scale = 1.0 / (1.0 / scale);
bias = (bias / scale) * scale;
}
if (!use_fp16_for_embedding_only && !use_acc_fp32) {
// Fake fp16 rounding of scale and bias
fbgemm::RoundToFloat16(
&scale, &scale, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fbgemm::RoundToFloat16(
&bias, &bias, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
scale *= weight;
// Fake fp16 rounding of scale and bias
fbgemm::RoundToFloat16(
&scale, &scale, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
// No fake fp16 rounding of the input is needed; the values are already integers.
std::vector<float> input_rounded(block_size);
for (const auto j : c10::irange(block_size)) {
input_rounded[j] =
input[fused_block_size * indices_data[current] + j];
}
if (use_fp16_for_embedding_only) {
// bias *= weight;
std::vector<float> product_rounded(block_size);
TypedAxpy<float, float>(
block_size, scale, input_rounded.data(), product_rounded.data());
for (const auto j : c10::irange(block_size)) {
product_rounded[j] += bias;
}
// Fake fp16 rounding of scale x input + bias
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(product_rounded.data()),
product_rounded.data(),
block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp,
FLAGS_caffe2_fbgemm_fake_fp16_clamp_denorms);
// Accumulate w x (scale x input + bias) to output
TypedAxpy<float, float>(
block_size,
weight,
reinterpret_cast<const float*>(product_rounded.data()),
out);
} else if (use_nnpi_fma) {
std::vector<float> mScale(block_size, scale);
std::vector<float> mBias(block_size, bias);
std::vector<float> mWeight(block_size, weight);
fake_fp16::fma_fp16(
block_size,
mBias.data(),
mWeight.data(),
rowTempSums[accIdx].data());
fake_fp16::fma_fp16(
block_size,
mScale.data(),
input_rounded.data(),
rowTempSums[accIdx].data());
} else if (use_acc_fp16) {
bias *= weight;
fbgemm::RoundToFloat16(
&bias, &bias, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
std::vector<float> product_rounded(block_size);
TypedAxpy<float, float>(
block_size, scale, input_rounded.data(), product_rounded.data());
// Fake fp16 rounding of w x scale x input
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(product_rounded.data()),
product_rounded.data(),
block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
for (const auto j : c10::irange(block_size)) {
product_rounded[j] += bias;
}
// Fake fp16 rounding of w x scale x input + w x bias
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(product_rounded.data()),
product_rounded.data(),
block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
// Accumulate w x scale x input + w x bias to output
TypedAxpy<float, float>(
block_size,
1.0,
reinterpret_cast<const float*>(product_rounded.data()),
out);
// Fake fp16 rounding of out + (w x scale x input + w x bias)
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(out),
out,
block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
} else if (use_acc_fp32) {
for (const auto j : c10::irange(block_size)) {
float deqVal = fake_fp16::fmafp32_avx_emulation(
scale,
input_rounded[j],
bias);
rowTempSums[accIdx][j] = fake_fp16::fmafp32_avx_emulation(
deqVal,
weight,
rowTempSums[accIdx][j]);
}
} else {
bias *= weight;
fbgemm::RoundToFloat16(
&bias, &bias, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
TypedAxpy<float, float>(block_size, scale, input_rounded.data(), out);
for (const auto j : c10::irange(block_size)) {
out[j] += bias;
}
}
++current;
}
if (use_nnpi_fma || use_acc_fp32) {
for (const auto j : c10::irange(block_size)) {
out[j] = rowTempSums[0][j] + rowTempSums[1][j];
}
}
if (use_nnpi_fma) {
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(out),
out,
block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
if (normalize_by_length && lengths_data[m]) {
float scale = 1.f / lengths_data[m];
if (!use_fp16_for_embedding_only) {
// Fake fp16 rounding of scale and out
fbgemm::RoundToFloat16(
&scale, &scale, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(out),
out,
block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
// hack: context is not really used
math::Scale<float, float, CPUContext>(
block_size, scale, out, out, nullptr);
}
out += block_size;
}
return current == index_size;
}
enum {
DATA = 0,
WEIGHTS = 1,
INDICES = 1 + with_weights,
LENGTHS = 2 + with_weights,
};
};
} // namespace caffe2
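The inverse-scale comment in the header above can be checked numerically: if a vendor stores s' = 1/s and b' = b/s, then (x_q + b') / s' recovers the same value as x_q * s + b (up to rounding). A small stand-alone check:

#include <cstdio>

int main() {
  const float s = 0.25f, b = -3.0f;
  const float s_prime = 1.0f / s; // vendor scale
  const float b_prime = b / s;    // vendor bias
  for (int x_q = 0; x_q <= 4; ++x_q) {
    const float ours = x_q * s + b;
    const float vendor = (x_q + b_prime) / s_prime;
    std::printf("x_q=%d ours=%g vendor=%g\n", x_q, ours, vendor);
  }
  return 0;
}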

View File

@ -1,217 +0,0 @@
#include "lengths_reducer_ops.h"
#include "caffe2/operators/segment_reduction_op.h"
namespace caffe2 {
// Use the _STR option because the schema is also declared with the _STR
// version in a generic fashion; otherwise the schema declaration check would break.
// TODO(dzhulgakov): remove _STR when all lengths ops are off the generic version.
using SparseLengthsSumOp =
SparseLengthsReductionFakeFp16Op<TensorTypes<float, at::Half>, 0, 0>;
using SparseLengthsWeightedSumOp =
SparseLengthsReductionFakeFp16Op<TensorTypes<float, at::Half>, 1, 0>;
using SparseLengthsMeanOp =
SparseLengthsReductionFakeFp16Op<TensorTypes<float, at::Half>, 0, 1>;
using SparseLengthsSumAccFP16Op =
SparseLengthsReductionFakeFp16Op<TensorTypes<float, at::Half>, 0, 0, 0, 1>;
using SparseLengthsWeightedSumAccFP16Op =
SparseLengthsReductionFakeFp16Op<TensorTypes<float, at::Half>, 1, 0, 0, 1>;
using SparseLengthsMeanAccFP16Op =
SparseLengthsReductionFakeFp16Op<TensorTypes<float, at::Half>, 0, 1, 0, 1>;
using SparseLengthsSumFakeFP16EmbeddingOnlyOp =
SparseLengthsReductionFakeFp16Op<
TensorTypes<float, at::Half>,
0,
0,
0,
0,
1>;
using SparseLengthsWeightedSumFakeFP16EmbeddingOnlyOp =
SparseLengthsReductionFakeFp16Op<
TensorTypes<float, at::Half>,
1,
0,
0,
0,
1>;
using SparseLengthsMeanFakeFP16EmbeddingOnlyOp =
SparseLengthsReductionFakeFp16Op<
TensorTypes<float, at::Half>,
0,
1,
0,
0,
1>;
REGISTER_CPU_OPERATOR(SparseLengthsSumFakeFP16, SparseLengthsSumOp);
REGISTER_CPU_OPERATOR(
SparseLengthsWeightedSumFakeFP16,
SparseLengthsWeightedSumOp);
REGISTER_CPU_OPERATOR(SparseLengthsMeanFakeFP16, SparseLengthsMeanOp);
REGISTER_CPU_OPERATOR(
SparseLengthsSumFakeFP16AccFP16,
SparseLengthsSumAccFP16Op);
REGISTER_CPU_OPERATOR(
SparseLengthsWeightedSumFakeFP16AccFP16,
SparseLengthsWeightedSumAccFP16Op);
REGISTER_CPU_OPERATOR(
SparseLengthsMeanFakeFP16AccFP16,
SparseLengthsMeanAccFP16Op);
REGISTER_CPU_OPERATOR(
SparseLengthsSumFakeFP16EmbeddingOnly,
SparseLengthsSumFakeFP16EmbeddingOnlyOp);
REGISTER_CPU_OPERATOR(
SparseLengthsWeightedSumFakeFP16EmbeddingOnly,
SparseLengthsWeightedSumFakeFP16EmbeddingOnlyOp);
REGISTER_CPU_OPERATOR(
SparseLengthsMeanFakeFP16EmbeddingOnly,
SparseLengthsMeanFakeFP16EmbeddingOnlyOp);
template <typename Def>
string FormatDoc() {
string doc = Def::doc;
c10::ReplaceAll(doc, "{op}", Def::OpDef::name);
c10::ReplaceAll(doc, "{op_doc}", Def::OpDef::doc);
auto replaced = c10::ReplaceAll(doc, "{extra}", "");
CAFFE_ENFORCE_EQ(replaced, 0);
return doc;
}
using SparseLengthsSumDef = AbstractSparseLengthsDef<
float,
int,
CPUContext,
SumReducerDef,
true /*GradientNeedIndices*/>;
OPERATOR_SCHEMA(SparseLengthsSumFakeFP16)
.NumInputs(SparseLengthsSumDef::ForwardOp::kNumInputs)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsSumOp::DATA,
SparseLengthsSumOp::INDICES,
SparseLengthsSumOp::LENGTHS)
.SetDoc(FormatDoc<SparseLengthsSumDef>())
.Output(0, "OUTPUT", "Aggregated tensor")
.FillUsing(SparseLengthsSumDef::PopulateSchema)
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsSumFakeFP16);
using SparseLengthsWeightedSumDef = AbstractSparseLengthsDef<
float,
int,
CPUContext,
WeightedSumReducerDef,
true /*GradientNeedIndices*/>;
OPERATOR_SCHEMA(SparseLengthsWeightedSumFakeFP16)
.NumInputs(SparseLengthsWeightedSumDef::ForwardOp::kNumInputs)
.NumOutputs(1)
.WeightedValueKeyLengthInputFillers(
SparseLengthsWeightedSumOp::DATA,
SparseLengthsWeightedSumOp::INDICES,
SparseLengthsWeightedSumOp::LENGTHS,
SparseLengthsWeightedSumOp::WEIGHT)
.SetDoc(FormatDoc<SparseLengthsWeightedSumDef>())
.Output(0, "OUTPUT", "Aggregated tensor")
.FillUsing(SparseLengthsWeightedSumDef::PopulateSchema)
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsWeightedSumFakeFP16);
using SparseLengthsMeanDef = AbstractSparseLengthsDef<
float,
int,
CPUContext,
MeanReducerDef,
true /*GradientNeedIndices*/>;
OPERATOR_SCHEMA(SparseLengthsMeanFakeFP16)
.NumInputs(SparseLengthsMeanDef::ForwardOp::kNumInputs)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsMeanOp::DATA,
SparseLengthsMeanOp::INDICES,
SparseLengthsMeanOp::LENGTHS)
.SetDoc(FormatDoc<SparseLengthsMeanDef>())
.Output(0, "OUTPUT", "Aggregated tensor")
.FillUsing(SparseLengthsMeanDef::PopulateSchema);
NO_GRADIENT(SparseLengthsMeanFakeFP16);
OPERATOR_SCHEMA(SparseLengthsSumFakeFP16AccFP16)
.NumInputs(SparseLengthsSumDef::ForwardOp::kNumInputs)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsSumOp::DATA,
SparseLengthsSumOp::INDICES,
SparseLengthsSumOp::LENGTHS)
.SetDoc(FormatDoc<SparseLengthsSumDef>())
.Output(0, "OUTPUT", "Aggregated tensor")
.FillUsing(SparseLengthsSumDef::PopulateSchema)
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsSumFakeFP16AccFP16);
OPERATOR_SCHEMA(SparseLengthsWeightedSumFakeFP16AccFP16)
.NumInputs(SparseLengthsWeightedSumDef::ForwardOp::kNumInputs)
.NumOutputs(1)
.WeightedValueKeyLengthInputFillers(
SparseLengthsWeightedSumOp::DATA,
SparseLengthsWeightedSumOp::INDICES,
SparseLengthsWeightedSumOp::LENGTHS,
SparseLengthsWeightedSumOp::WEIGHT)
.SetDoc(FormatDoc<SparseLengthsWeightedSumDef>())
.Output(0, "OUTPUT", "Aggregated tensor")
.FillUsing(SparseLengthsWeightedSumDef::PopulateSchema)
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsWeightedSumFakeFP16AccFP16);
OPERATOR_SCHEMA(SparseLengthsMeanFakeFP16AccFP16)
.NumInputs(SparseLengthsMeanDef::ForwardOp::kNumInputs)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsMeanOp::DATA,
SparseLengthsMeanOp::INDICES,
SparseLengthsMeanOp::LENGTHS)
.SetDoc(FormatDoc<SparseLengthsMeanDef>())
.Output(0, "OUTPUT", "Aggregated tensor")
.FillUsing(SparseLengthsMeanDef::PopulateSchema);
NO_GRADIENT(SparseLengthsMeanFakeFP16AccFP16);
OPERATOR_SCHEMA(SparseLengthsSumFakeFP16EmbeddingOnly)
.NumInputs(SparseLengthsSumDef::ForwardOp::kNumInputs)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsSumFakeFP16EmbeddingOnlyOp::DATA,
SparseLengthsSumFakeFP16EmbeddingOnlyOp::INDICES,
SparseLengthsSumFakeFP16EmbeddingOnlyOp::LENGTHS)
.SetDoc(FormatDoc<SparseLengthsSumDef>())
.Output(0, "OUTPUT", "Aggregated tensor")
.FillUsing(SparseLengthsSumDef::PopulateSchema)
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsSumFakeFP16EmbeddingOnly);
OPERATOR_SCHEMA(SparseLengthsWeightedSumFakeFP16EmbeddingOnly)
.NumInputs(SparseLengthsWeightedSumDef::ForwardOp::kNumInputs)
.NumOutputs(1)
.WeightedValueKeyLengthInputFillers(
SparseLengthsWeightedSumFakeFP16EmbeddingOnlyOp::DATA,
SparseLengthsWeightedSumFakeFP16EmbeddingOnlyOp::INDICES,
SparseLengthsWeightedSumFakeFP16EmbeddingOnlyOp::LENGTHS,
SparseLengthsWeightedSumFakeFP16EmbeddingOnlyOp::WEIGHT)
.SetDoc(FormatDoc<SparseLengthsWeightedSumDef>())
.Output(0, "OUTPUT", "Aggregated tensor")
.FillUsing(SparseLengthsWeightedSumDef::PopulateSchema)
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsWeightedSumFakeFP16EmbeddingOnly);
OPERATOR_SCHEMA(SparseLengthsMeanFakeFP16EmbeddingOnly)
.NumInputs(SparseLengthsMeanDef::ForwardOp::kNumInputs)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsMeanFakeFP16EmbeddingOnlyOp::DATA,
SparseLengthsMeanFakeFP16EmbeddingOnlyOp::INDICES,
SparseLengthsMeanFakeFP16EmbeddingOnlyOp::LENGTHS)
.SetDoc(FormatDoc<SparseLengthsMeanDef>())
.Output(0, "OUTPUT", "Aggregated tensor")
.FillUsing(SparseLengthsMeanDef::PopulateSchema);
NO_GRADIENT(SparseLengthsMeanFakeFP16EmbeddingOnly);
} // namespace caffe2

View File

@ -1,268 +0,0 @@
#pragma once
#include <fbgemm/FbgemmConvert.h>
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/perfkernels/typed_axpy.h"
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp_denorms);
namespace caffe2 {
// A templated class that implements SparseLengths[Sum,WeightedSum,Mean].
template <
class InputTypes, // supported input types, such as TensorTypes<float>
bool USE_WEIGHT = 0, // Whether it is SparseLengthsWeightedSum
bool USE_MEAN = 0, // Whether this is SparseLengthsMean
bool USE_POSITIONAL_WEIGHT = 0,
bool USE_ACC_FP16 = 0, // Whether use fp16 accumulation
bool USE_FP16_FOR_EMBEDDING_ONLY =
0 // Whether use fp16 for embedding entries only
// USE_WEIGHT = 1 and USE_POSITIONAL_WEIGHT = 1
// -> SparseLengthsPositionalWeightedSum
>
class SparseLengthsReductionFakeFp16Op final : public Operator<CPUContext> {
public:
USE_OPERATOR_FUNCTIONS(CPUContext);
template <class... Args>
explicit SparseLengthsReductionFakeFp16Op(Args&&... args)
: Operator<CPUContext>(std::forward<Args>(args)...) {
static_assert(
!(USE_WEIGHT & USE_MEAN), "Cannot both specify weight and mean.");
}
~SparseLengthsReductionFakeFp16Op() noexcept override {}
// Currently, we support float and at::Half inputs for input data type, and
// int32_t and int64_t for the index type.
bool RunOnDevice() override {
return DispatchHelper<InputTypes>::call(this, Input(DATA));
}
template <typename InputType>
bool DoRunWithType() {
return DispatchHelper<TensorTypes2<int32_t, int64_t>, InputType>::call(
this, Input(INDICES));
}
template <typename InputType, typename IndexType>
bool DoRunWithType2() {
auto& dataInput = Input(DATA);
auto& indicesInput = Input(INDICES);
auto& lengthsInput = Input(LENGTHS);
CAFFE_ENFORCE_EQ(1, indicesInput.dim(), "INDICES must be a vector");
CAFFE_ENFORCE_EQ(1, lengthsInput.dim(), "LENGTHS must be a vector");
const int64_t N = dataInput.size(0);
const int D = dataInput.size_from_dim(1);
const int64_t M = lengthsInput.size(0);
const int64_t indices_size = indicesInput.numel();
auto shape = dataInput.sizes().vec();
shape[0] = M;
auto* output = Output(0, shape, at::dtype<float>());
float* out_data = output->template mutable_data<float>();
const InputType* in_data = dataInput.template data<InputType>();
const IndexType* indices = indicesInput.template data<IndexType>();
const int* lengths = lengthsInput.template data<int>();
const float* in_weight = nullptr;
if (USE_WEIGHT) {
// static if
auto& weightInput = Input(WEIGHT);
CAFFE_ENFORCE_EQ(1, weightInput.dim(), "WEIGHT must be a vector");
if (!USE_POSITIONAL_WEIGHT) {
CAFFE_ENFORCE_EQ(
weightInput.numel(),
indices_size,
"Weight should have the same length as indices.");
}
in_weight = weightInput.template data<float>();
}
// Copied from EmbeddingLookupGenericSlow in perfkernels/embedding_lookup.cc
int64_t block_size = D;
int64_t output_size = M;
int64_t index_size = indices_size;
int64_t data_size = N;
const InputType* input = in_data;
const float* weights = in_weight;
bool normalize_by_lengths = USE_MEAN;
float* out = out_data;
int64_t current = 0;
for (const auto m : c10::irange(output_size)) {
memset(out, 0, sizeof(float) * block_size);
if (current + lengths[m] > index_size) {
return false;
}
for (int i = 0; i < lengths[m]; ++i) {
int64_t idx = indices[current];
if (idx < 0 || idx >= data_size) {
return false;
}
float w = 1.f;
if (weights) {
w = weights[USE_POSITIONAL_WEIGHT ? i : current];
if (!USE_FP16_FOR_EMBEDDING_ONLY) {
// Fake fp16 rounding of w
fbgemm::RoundToFloat16(
&w, &w, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
}
if (USE_FP16_FOR_EMBEDDING_ONLY) {
std::vector<float> product_rounded(block_size);
if (std::is_same<InputType, at::Half>::value) {
TypedAxpy<InputType, float>(
block_size,
w,
input + block_size * indices[current],
product_rounded.data());
} else {
bool is_float = std::is_same<InputType, float>::value;
assert(is_float);
// Fake fp16 rounding of input
std::vector<float> input_rounded(block_size);
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(
input + block_size * indices[current]),
input_rounded.data(),
block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp,
FLAGS_caffe2_fbgemm_fake_fp16_clamp_denorms);
TypedAxpy<float, float>(
block_size,
w,
reinterpret_cast<const float*>(input_rounded.data()),
product_rounded.data());
}
// Accumulate w x input to output
TypedAxpy<float, float>(
block_size,
1.0,
reinterpret_cast<const float*>(product_rounded.data()),
out);
} else if (USE_ACC_FP16) {
std::vector<float> product_rounded(block_size);
if (std::is_same<InputType, at::Half>::value) {
TypedAxpy<InputType, float>(
block_size,
w,
input + block_size * indices[current],
product_rounded.data());
} else {
bool is_float = std::is_same<InputType, float>::value;
assert(is_float);
// Fake fp16 rounding of input
std::vector<float> input_rounded(block_size);
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(
input + block_size * indices[current]),
input_rounded.data(),
block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
TypedAxpy<float, float>(
block_size,
w,
reinterpret_cast<const float*>(input_rounded.data()),
product_rounded.data());
}
// Fake fp16 rounding of w x input
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(product_rounded.data()),
product_rounded.data(),
block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
// Accumulate w x input to output
TypedAxpy<float, float>(
block_size,
1.0,
reinterpret_cast<const float*>(product_rounded.data()),
out);
// Fake fp16 rounding of out + w x input
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(out),
out,
block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
} else {
if (std::is_same<InputType, at::Half>::value) {
TypedAxpy<InputType, float>(
block_size, w, input + block_size * indices[current], out);
} else {
bool is_float = std::is_same<InputType, float>::value;
assert(is_float);
// Fake fp16 rounding of input
std::vector<float> input_rounded(block_size);
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(
input + block_size * indices[current]),
input_rounded.data(),
block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
TypedAxpy<float, float>(
block_size,
w,
reinterpret_cast<const float*>(input_rounded.data()),
out);
}
}
++current;
}
if (normalize_by_lengths && lengths[m]) {
float scale = 1.f / lengths[m];
if (!USE_FP16_FOR_EMBEDDING_ONLY) {
// Fake fp16 rounding of scale and out
fbgemm::RoundToFloat16(
&scale, &scale, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(out),
out,
block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
// hack: context is not really used
math::Scale<float, float, CPUContext>(
block_size, scale, out, out, nullptr);
}
if (!USE_FP16_FOR_EMBEDDING_ONLY) {
// Fake fp16 rounding of out
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(out),
reinterpret_cast<float*>(out),
block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
out += block_size;
}
return current == index_size;
}
enum {
DATA = 0, // Data input.
WEIGHT = 1, // Weight input used in SparseLengthsWeightedSum
INDICES = 1 + USE_WEIGHT, // 1 in SparseLengths[Sum,Mean] and
// 2 in SparseLengthsWeightedSum
LENGTHS = 2 + USE_WEIGHT, // 2 in SparseLengths[Sum, Mean],
// 3 in SparseLengthsWeightedSum
};
};
} // namespace caffe2
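The USE_ACC_FP16 path above rounds every intermediate (the weighted product and the running sum) back to fp16 before the next step. Below is a rough sketch of that accumulation pattern; fbgemm::RoundToFloat16 is stood in for by a crude 10-bit mantissa truncation, which ignores rounding mode, exponent range, and denormals:

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// Crude fp16 emulation: keep the sign, exponent, and top 10 mantissa bits.
float fake_fp16(float v) {
  uint32_t bits;
  std::memcpy(&bits, &v, sizeof(bits));
  bits &= 0xFFFFE000u;
  std::memcpy(&v, &bits, sizeof(bits));
  return v;
}

int main() {
  const std::vector<float> x = {0.1f, 0.2f, 0.3f};
  const float w = 0.7f;
  float acc = 0.0f;
  for (float xi : x) {
    const float prod = fake_fp16(w * xi); // truncate the product
    acc = fake_fp16(acc + prod);          // truncate the running sum
  }
  std::printf("fp16-style sum = %g, fp32 sum = %g\n",
              acc, w * (0.1f + 0.2f + 0.3f));
  return 0;
}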

View File

@ -1,20 +0,0 @@
#include "caffe2/contrib/fakelowp/quant_lut_fp16_fake_op.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(TanhQuantFakeFp16NNPI, TanhInt8QuantizeNNPIOp);
OPERATOR_SCHEMA(TanhQuantFakeFp16NNPI)
.Arg("Y_scale", "Output tensor quantization scale")
.Arg("Y_zero_point", "Output tensor quantization offset")
.NumInputs(1)
.NumOutputs(1)
.SetDoc(R"DOC(
Apply TanH and convert the result to Int8.
<details>
</details>
)DOC")
.Input(0, "X", "Float Tensor X.")
.Output(0, "Y", "Int8 Tensor Y.");
} // namespace caffe2

View File

@ -1,91 +0,0 @@
#pragma once
#include <array>
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor_int8.h"
#include "caffe2/operators/quantized/int8_utils.h"
#include <immintrin.h>
#include <emmintrin.h>
namespace caffe2 {
namespace {
class TanhInt8QuantizeNNPIOp final : public Operator<CPUContext> {
public:
using Operator<CPUContext>::Operator;
bool RunOnDevice() override {
const auto& X = Input(0);
auto* Y = Outputs()[0]->template GetMutable<int8::Int8TensorCPU>();
Y->t.ResizeLike(X);
int32_t Y_offset = this->template GetSingleArgument<int>("Y_zero_point", 0);
auto Y_scale = this->template GetSingleArgument<float>("Y_scale", 1);
Y->scale = Y_scale;
Y->zero_point = Y_offset;
constexpr int tanhLUTMinOffset = 0;
constexpr int tanhLUTMaxOffset = 18000;
constexpr int lutSize = tanhLUTMaxOffset - tanhLUTMinOffset;
std::array<uint8_t, lutSize> tanhLUT;
Y_scale = 1.0f / Y_scale;
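// Y_scale now holds the inverse scale, so each table entry below stores
// round(tanh(x) * (1 / scale) + zero_point).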
// Build the lookup table once, indexed by the raw fp16 bit pattern of the input.
for (const auto i : c10::irange(lutSize)) {
short input = i + tanhLUTMinOffset;
float x = _cvtsh_ss(input);
float tanh_x = tanh(x);
tanh_x = round(tanh_x * Y_scale + Y_offset);
if (tanh_x < 0 || tanh_x > 255.0) {
tanh_x = 255.0;
}
uint32_t tanh_quant = (uint32_t)(tanh_x);
tanhLUT[i] = (uint8_t)tanh_quant;
}
const float* X_data = X.template data<float>();
for (const auto i : c10::irange(X.numel())) {
short val = _cvtss_sh(X_data[i], 0);
unsigned short max16BitPositive = 0x7FFF;
unsigned short input16Bit = (*(unsigned short*)& val);
short shortAbsInput = input16Bit & max16BitPositive; // mask out the sign bit
short clampShortAbsInput = shortAbsInput;
if (shortAbsInput < (short)tanhLUTMinOffset) {
clampShortAbsInput = (short)tanhLUTMinOffset;
}
if (shortAbsInput > (short)(tanhLUTMaxOffset - 1)) {
clampShortAbsInput = (short)(tanhLUTMaxOffset - 1);
}
short inputInLutRange = clampShortAbsInput - tanhLUTMinOffset;
short temp = tanhLUT[inputInLutRange];
if (input16Bit > max16BitPositive) { // negative value
temp = temp - Y_offset;
temp = temp * (-1);
temp = temp + Y_offset;
}
uint8_t output = (uint8_t)temp;
if (temp < 0) {
output = 0;
}
Y->t.mutable_data<uint8_t>()[i] = output;
}
return true;
}
};
} // namespace
} // namespace caffe2
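The LUT op above only tabulates non-negative fp16 inputs and reflects negative inputs around the zero point using tanh(-x) = -tanh(x), i.e. q(-x) = 2 * zero_point - q(x). Below is a simplified sketch of that sign handling without the fp16 bit-pattern table; the numbers are illustrative:

#include <cmath>
#include <cstdint>
#include <cstdio>

// Quantize tanh(x) to uint8 with the same reflection trick as the LUT op.
uint8_t quantize_tanh(float x, float inv_scale, int32_t zero_point) {
  const float ax = std::fabs(x);
  int32_t q = static_cast<int32_t>(
      std::round(std::tanh(ax) * inv_scale + zero_point));
  if (x < 0) {
    q = zero_point - (q - zero_point); // reflect around the zero point
  }
  if (q < 0) q = 0;
  if (q > 255) q = 255;
  return static_cast<uint8_t>(q);
}

int main() {
  const float inv_scale = 127.0f; // Y_scale = 1/127
  const int32_t zero_point = 128;
  std::printf("%d %d %d\n",
              quantize_tanh(-1.0f, inv_scale, zero_point),
              quantize_tanh(0.0f, inv_scale, zero_point),
              quantize_tanh(1.0f, inv_scale, zero_point)); // prints 31 128 225
  return 0;
}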

View File

@ -1,15 +0,0 @@
#include "spatial_batch_norm_fp16_fake_op.h"
#include <array>
#include "caffe2/utils/eigen_utils.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(SpatialBNFakeLoweredFp16NNPI, SpatialBNFakeLoweredFp16Op);
OPERATOR_SCHEMA(SpatialBNFakeLoweredFp16NNPI).NumInputs({1, 5}).NumOutputs(1);
REGISTER_CPU_OPERATOR(SpatialBNFakeFp16NNPI, SpatialBNFakeFp16Op);
OPERATOR_SCHEMA(SpatialBNFakeFp16NNPI).NumInputs({1, 5}).NumOutputs(1);
} // namespace caffe2

View File

@ -1,395 +0,0 @@
#pragma once
#include <algorithm>
#include <array>
#include <functional>
#include <string>
#include <vector>
#include <fbgemm/FbgemmConvert.h>
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"
#include "fp16_fma.h"
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
namespace caffe2 {
class SpatialBNFakeLoweredFp16Op : public Operator<CPUContext> {
public:
USE_OPERATOR_FUNCTIONS(CPUContext);
template <class... Args>
explicit SpatialBNFakeLoweredFp16Op(Args&&... args)
: Operator<CPUContext>(std::forward<Args>(args)...),
OP_SINGLE_ARG(bool, OpSchema::Arg_IsTest, is_test_, false),
OP_SINGLE_ARG(double, "epsilon", epsilon_, 1e-5),
order_(StringToStorageOrder(
this->template GetSingleArgument<std::string>("order", "NCHW"))),
OP_SINGLE_ARG(int, "num_batches", num_batches_, 1) {
// TODO: only support NCHW for now
CAFFE_ENFORCE_EQ(order_, StorageOrder::NCHW);
CAFFE_ENFORCE(
(is_test_ && OutputSize() == 1) || (!is_test_ && OutputSize() == 5));
CAFFE_ENFORCE_GT(epsilon_, 0);
}
~SpatialBNFakeLoweredFp16Op() override = default;
bool RunOnDevice() override {
return DispatchHelper<TensorTypes<float>>::call(this, Input(0));
}
template <typename T>
bool DoRunWithType() {
const auto& X = Input(INPUT);
const auto& scale = Input(SCALE);
const auto& bias = Input(BIAS);
const int ndim = X.dim();
CAFFE_ENFORCE_GE(ndim, 2);
const int N = X.dim32(0);
const int C =
(order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(ndim - 1));
const std::vector<int> X_dims(X.sizes().cbegin(), X.sizes().cend());
const int HxW =
std::accumulate(
X_dims.cbegin() + 1, X_dims.cend(), 1, std::multiplies<int>()) /
C;
CAFFE_ENFORCE_EQ(scale.numel(), C);
CAFFE_ENFORCE_EQ(bias.numel(), C);
auto* Y = Output(OUTPUT, X.sizes(), at::dtype<T>());
T* Y_data = Y->template mutable_data<T>();
ReinitializeTensor(
&alpha_, {C}, at::dtype<T>().device(CPUContext::GetDeviceType()));
T* alpha_data = alpha_.template mutable_data<T>();
// We only support this case at the moment
CAFFE_ENFORCE(is_test_);
std::vector<float> X_fp16(X.numel());
fbgemm::RoundToFloat16(
X.template data<T>(),
X_fp16.data(),
N * C * HxW,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
if (N == 0) {
return true;
}
const auto& mean = Input(EST_MEAN);
const auto& var = Input(EST_VAR);
CAFFE_ENFORCE_EQ(mean.numel(), C);
CAFFE_ENFORCE_EQ(var.numel(), C);
std::vector<float> mean_fp16(C), var_fp16(C);
std::vector<float> scale_fp16(C), bias_fp16(C);
fbgemm::RoundToFloat16(
scale.template data<T>(),
scale_fp16.data(),
C,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fbgemm::RoundToFloat16(
bias.template data<T>(),
bias_fp16.data(),
C,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fbgemm::RoundToFloat16(
mean.template data<T>(),
mean_fp16.data(),
C,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fbgemm::RoundToFloat16(
var.template data<T>(),
var_fp16.data(),
C,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
EigenVectorArrayMap<T> alpha_arr(alpha_data, C);
std::vector<float> tmp(C);
EigenVectorArrayMap<T> tmp_arr(tmp.data(), C);
auto epsilon = static_cast<T>(epsilon_);
fbgemm::RoundToFloat16(
&epsilon, &epsilon, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
tmp_arr = (ConstEigenVectorArrayMap<T>(var_fp16.data(), C) + epsilon);
fbgemm::RoundToFloat16(
tmp.data(), tmp.data(), C, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
tmp_arr = tmp_arr.pow(0.5);
fbgemm::RoundToFloat16(
tmp.data(), tmp.data(), C, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
alpha_arr = ConstEigenVectorArrayMap<T>(scale_fp16.data(), C) / tmp_arr;
fbgemm::RoundToFloat16(
alpha_data, alpha_data, C, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
AffineChannel_NCHW(
N,
C,
HxW,
X_fp16.data(),
alpha_data,
bias_fp16.data(),
mean_fp16.data(),
Y_data);
fbgemm::RoundToFloat16(
Y_data, Y_data, N * HxW * C, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
return true;
}
protected:
void AffineChannel_NCHW(
const int N,
const int C,
const int HxW,
const float* X,
const float* scale,
const float* bias,
const float* mean,
float* Y) {
ConstEigenVectorArrayMap<float> scale_arr(scale, C);
ConstEigenVectorArrayMap<float> bias_arr(bias, C);
ConstEigenVectorArrayMap<float> mean_arr(mean, C);
const int stride = C * HxW;
const float* X_ptr = X;
float* Y_ptr = Y;
for ([[maybe_unused]] const auto i : c10::irange(N)) {
EigenArrayMap<float>(Y_ptr, HxW, C) =
ConstEigenArrayMap<float>(X_ptr, HxW, C).rowwise() -
mean_arr.transpose();
fbgemm::RoundToFloat16(
Y_ptr, Y_ptr, HxW * C, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
EigenArrayMap<float>(Y_ptr, HxW, C).rowwise() *= scale_arr.transpose();
fbgemm::RoundToFloat16(
Y_ptr, Y_ptr, HxW * C, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
EigenArrayMap<float>(Y_ptr, HxW, C).rowwise() += bias_arr.transpose();
X_ptr += stride;
Y_ptr += stride;
}
fbgemm::RoundToFloat16(
Y, Y, N * HxW * C, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
const bool is_test_;
double epsilon_;
const StorageOrder order_;
const int num_batches_;
Tensor alpha_;
INPUT_TAGS(
INPUT,
SCALE,
BIAS,
EST_MEAN,
EST_VAR,
BATCH_MEAN_SUM,
BATCH_VAR_SUM);
OUTPUT_TAGS(OUTPUT, RUNNING_MEAN, RUNNING_VAR, SAVED_MEAN, SAVED_INV_STD);
};
// Emulation of the NNPI SpatialBN kernel
class SpatialBNFakeFp16Op : public Operator<CPUContext> {
public:
USE_OPERATOR_FUNCTIONS(CPUContext);
template <class... Args>
explicit SpatialBNFakeFp16Op(Args&&... args)
: Operator<CPUContext>(std::forward<Args>(args)...),
OP_SINGLE_ARG(bool, OpSchema::Arg_IsTest, is_test_, false),
OP_SINGLE_ARG(float, "epsilon", epsilon_, 1e-5),
order_(StringToStorageOrder(
this->template GetSingleArgument<std::string>("order", "NCHW"))),
OP_SINGLE_ARG(int, "num_batches", num_batches_, 1) {
// TODO: only support NCHW for now
CAFFE_ENFORCE_EQ(order_, StorageOrder::NCHW);
// We only support this case at the moment
CAFFE_ENFORCE(is_test_);
CAFFE_ENFORCE_GT(epsilon_, 0);
}
~SpatialBNFakeFp16Op() override = default;
bool RunOnDevice() override {
return DispatchHelper<TensorTypes<float>>::call(this, Input(0));
}
template <typename T>
bool DoRunWithType() {
LOG(INFO) << "Running with " << sizeof(T);
const auto& X = Input(INPUT);
const auto& scale = Input(SCALE);
const auto& bias = Input(BIAS);
const int ndim = X.dim();
CAFFE_ENFORCE_GE(ndim, 2);
const int N = X.dim32(0);
const int C =
(order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(ndim - 1));
const std::vector<int> X_dims(X.sizes().cbegin(), X.sizes().cend());
const int HxW =
std::accumulate(
X_dims.cbegin() + 1, X_dims.cend(), 1, std::multiplies<int>()) /
C;
CAFFE_ENFORCE_EQ(scale.numel(), C);
CAFFE_ENFORCE_EQ(bias.numel(), C);
auto* Y = Output(OUTPUT, X.sizes(), at::dtype<T>());
T* Y_data = Y->template mutable_data<T>();
ReinitializeTensor(
&alpha_, {C}, at::dtype<T>().device(CPUContext::GetDeviceType()));
ReinitializeTensor(
&beta_, {C}, at::dtype<T>().device(CPUContext::GetDeviceType()));
T* alpha_data = alpha_.template mutable_data<T>();
T* beta_data = beta_.template mutable_data<T>();
std::vector<float> X_fp16(X.numel());
fbgemm::RoundToFloat16(
X.template data<T>(),
X_fp16.data(),
N * C * HxW,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
const auto& mean = Input(EST_MEAN);
const auto& var = Input(EST_VAR);
CAFFE_ENFORCE_EQ(mean.numel(), C);
CAFFE_ENFORCE_EQ(var.numel(), C);
std::vector<float> mean_fp16(C), var_fp16(C);
std::vector<float> scale_fp16(C), bias_fp16(C);
fbgemm::RoundToFloat16(
scale.template data<T>(),
scale_fp16.data(),
C,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fbgemm::RoundToFloat16(
bias.template data<T>(),
bias_fp16.data(),
C,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fbgemm::RoundToFloat16(
mean.template data<T>(),
mean_fp16.data(),
C,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fbgemm::RoundToFloat16(
var.template data<T>(),
var_fp16.data(),
C,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
// This part is run on the CPU/x86 core
ComputeFusedParam<T>(
C,
scale_fp16.data(),
bias_fp16.data(),
mean_fp16.data(),
var_fp16.data(),
alpha_data,
beta_data);
AffineChannel_NCHW(N, C, HxW, X_fp16.data(), alpha_data, beta_data, Y_data);
fbgemm::RoundToFloat16(
Y_data, Y_data, N * HxW * C, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
return true;
}
protected:
template <typename T>
void ComputeFusedParam(
const int C,
const T* scale,
const T* bias,
const T* mean,
const T* var,
T* alpha,
T* beta) {
// alpha = scale / sqrt(var + epsilon)
// beta = bias - alpha * mean
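// Folding the normalization into one affine transform lets the per-element
// kernel below compute Y = alpha * X + beta with a single fused multiply-add.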
EigenVectorArrayMap<T> alpha_arr(alpha, C);
EigenVectorArrayMap<T> beta_arr(beta, C);
std::vector<T> tmp(C, 0.0);
EigenVectorArrayMap<T> tmp_arr(tmp.data(), C);
tmp_arr = ConstEigenVectorArrayMap<T>(var, C) + static_cast<T>(epsilon_);
// sqrt using intrinsics
int i = 0;
constexpr int blockSize = 8;
for (i = 0; i + blockSize <= C; i += blockSize) {
__m256 t = _mm256_loadu_ps(&tmp[i]);
_mm256_storeu_ps(&tmp[i], _mm256_sqrt_ps(t));
}
for (; i < C; i++) {
tmp[i] = sqrt(tmp[i]);
}
alpha_arr = ConstEigenVectorArrayMap<T>(scale, C) / tmp_arr;
beta_arr = ConstEigenVectorArrayMap<T>(bias, C) -
alpha_arr * ConstEigenVectorArrayMap<T>(mean, C);
fbgemm::RoundToFloat16(
alpha, alpha, C, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fbgemm::RoundToFloat16(beta, beta, C, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
void AffineChannel_NCHW(
const int N,
const int C,
const int HxW,
const float* X,
const float* scale,
const float* bias,
float* Y) {
ConstEigenVectorArrayMap<float> scale_arr(scale, C);
ConstEigenVectorArrayMap<float> bias_arr(bias, C);
const int stride = C * HxW;
const float* X_ptr = X;
float* Y_ptr = Y;
// Do Y = X * scale + bias
for ([[maybe_unused]] const auto i : c10::irange(N)) {
for (const auto j : c10::irange(C)) {
for (const auto k : c10::irange(HxW)) {
Y_ptr[HxW * j + k] = bias[j];
}
std::vector<float> s2(HxW, scale[j]);
fake_fp16::fma_fp16(
HxW, X_ptr + j * HxW, s2.data(), Y_ptr + HxW * j);
}
X_ptr += stride;
Y_ptr += stride;
}
fbgemm::RoundToFloat16(
Y, Y, N * HxW * C, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
const bool is_test_;
float epsilon_;
const StorageOrder order_;
const int num_batches_;
Tensor alpha_;
Tensor beta_;
INPUT_TAGS(
INPUT,
SCALE,
BIAS,
EST_MEAN,
EST_VAR,
BATCH_MEAN_SUM,
BATCH_VAR_SUM);
OUTPUT_TAGS(OUTPUT, RUNNING_MEAN, RUNNING_VAR, SAVED_MEAN, SAVED_INV_STD);
}; // class SpatialBNFakeFp16Op
} // namespace caffe2

View File

@ -1,69 +0,0 @@
#pragma once
#include <caffe2/core/operator.h>
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
namespace caffe2 {
template <class Context>
class SumFP16FP16AccOp : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
USE_SIMPLE_CTOR_DTOR(SumFP16FP16AccOp);
bool DoRunWithFloat() {
auto& input0 = Input(0);
size_t N = input0.numel();
auto* output = Output(0, input0.sizes(), at::dtype<float>());
// Dimension checking
for (const auto i : c10::irange(1, InputSize())) {
if (output->sizes() != Input(i).sizes()) {
CAFFE_THROW(
"Check failed: output->sizes() == Input(i).sizes().",
"Description: Input #",
i,
", input dimension:",
Input(i).sizes(),
" should match output dimension: ",
output->sizes());
}
}
float* output_data = output->template mutable_data<float>();
memset(output_data, 0, sizeof(float) * input0.numel());
std::vector<float> t1(N);
std::vector<float> t2(N);
for (const auto i : c10::irange(InputSize())) {
fbgemm::RoundToFloat16(
Input(i).template data<float>(),
t1.data(),
N,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fbgemm::RoundToFloat16(
output_data, t2.data(), N, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
math::Add(N, t1.data(), t2.data(), output_data, &context_);
}
fbgemm::RoundToFloat16(
output_data, output_data, N, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
return true;
}
bool RunOnDevice() override {
if (Input(0).template IsType<float>()) {
return DoRunWithFloat();
} else {
CAFFE_THROW(
"Sum operator only supports 32-bit float, but",
" input was of type ",
Input(0).dtype().name());
}
}
};
} // namespace caffe2

View File

@ -1,53 +0,0 @@
# How to run FakeLowP vs Glow tests
This was tested on Ubuntu 16.04 LTS but should work on Linux systems in general. The tested compiler is Clang-8.
## Build Glow Onnxifi Library
Follow https://github.com/pytorch/glow/blob/master/README.md to install the dependencies of Glow. Then, at the Glow root, run
```
mkdir build && cd build
cmake -G Ninja -DGLOW_BUILD_ONNXIFI_DYNLIB=ON ..
ninja all
```
Note that you probably want to add other flags like `-DGLOW_WITH_NNPI=1` to enable a specific backend if you have that flow set up. Also, make sure `LD_LIBRARY_PATH` points to the libomp.so path when compiling with `-DGLOW_WITH_NNPI=1`.
```
export LD_LIBRARY_PATH=/usr/lib/llvm-8/lib
```
Once the build succeeds, you will get a dynamic library at `build/lib/Onnxifi/libonnxifi.so`. We will use it later.
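If you want a quick sanity check that the library is loadable before wiring it into the tests, a minimal sketch is shown below. This is just a convenience, not part of the official flow, and the path is an example; point it at your own Glow checkout.
```
import ctypes
import os

# Example path; replace with the location of your Glow build.
lib = "glow/build/lib/Onnxifi/libonnxifi.so"
assert os.path.exists(lib), "libonnxifi.so not found at " + lib
ctypes.CDLL(lib)  # raises OSError if the library or its dependencies fail to load
print("libonnxifi.so loaded OK")
```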
## Build and Install PyTorch
Follow https://github.com/pytorch/pytorch/blob/main/README.md to install the dependencies of PyTorch. It is easiest to
set up a Python virtualenv or conda environment. Please use Python > 3.5.2, because the hypothesis library exposes a Python bug
that was only fixed after 3.5.2; something like Python 3.7 is good enough. You can install Python 3.7 with
```
sudo apt-get install -y build-essential checkinstall libreadline-gplv2-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev zlib1g-dev openssl libffi-dev python3-dev python3-setuptools wget
wget https://www.python.org/ftp/python/3.7.4/Python-3.7.4.tgz && tar -xf Python-3.7.4.tgz
cd Python-3.7.4
./configure && make -j 8 && sudo make altinstall
```
Once Python 3.7 is installed, here is a virtualenv-based flow:
```
sudo pip3.7 install virtualenv
python3.7 -m venv venv3
source venv3/bin/activate
cd pytorch
pip install -r requirements.txt
pip install pytest hypothesis protobuf
```
You probably also need to install gflags-dev with
```
sudo apt-get install libgflags-dev
```
Once you have all the dependencies installed, build PyTorch with FakeLowP operator support
```
USE_CUDA=0 USE_ROCM=0 USE_FAKELOWP=ON DEBUG=1 CMAKE_BUILD_TYPE=Debug USE_GFLAGS=1 USE_GLOG=1 USE_MKLDNN=0 BUILD_TEST=0 python setup.py install
```
The key options here are `USE_FAKELOWP=ON`, which enables building the FakeLowP operators, and `USE_GFLAGS=1`, which enables gflags (we
use gflags in Glow to pass options). The other flags are mostly for faster build times and easier debugging.
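To confirm that the FakeLowP operators actually made it into the build, one quick check (a convenience sketch, assuming the caffe2 Python bindings import cleanly in your environment) is to query the operator registry; `SpatialBNFakeFp16NNPI` is one of the FakeLowP operators, and any of them works here.
```
from caffe2.python import core

# Prints True only if the FakeLowP operators were compiled into this build.
print(core.IsOperator("SpatialBNFakeFp16NNPI"))
```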
## Run the test
You can now run the tests with a command like the following from inside the Python virtual environment:
```
OSS_ONNXIFI_LIB=${PATH_TO_GLOW}/build/lib/Onnxifi/libonnxifi.so pytest pytorch/caffe2/contrib/fakelowp/test --hypothesis-show-statistics
```
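The same run can also be driven from a small Python script if that is more convenient. The paths below are placeholders for your own checkouts; the script only sets `OSS_ONNXIFI_LIB` before handing off to pytest.
```
import os
import pytest

# The fakelowp tests locate the Glow onnxifi library through this variable.
os.environ["OSS_ONNXIFI_LIB"] = "/path/to/glow/build/lib/Onnxifi/libonnxifi.so"
raise SystemExit(pytest.main([
    "pytorch/caffe2/contrib/fakelowp/test",
    "--hypothesis-show-statistics",
]))
```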

View File

@ -1,108 +0,0 @@
# mypy: ignore-errors
import numpy as np
import unittest
import caffe2.python.fakelowp.init_shared_libs # noqa
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace
from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
from caffe2.python.fakelowp.test_utils import print_test_debug_info
import datetime
from hypothesis import given, settings
import hypothesis.strategies as st
import caffe2.python.serialized_test.serialized_test_util as serial
core.GlobalInit(["caffe2", "--caffe2_log_level=-3", "--glow_global_fp16=1"])
class TestBatchMatMul(serial.SerializedTestCase):
@given(
C=st.integers(min_value=1, max_value=10),
M=st.integers(min_value=1, max_value=50),
K=st.integers(min_value=1, max_value=512),
N=st.integers(min_value=1, max_value=50),
rand_seed=st.integers(0, 65534),
trans_a=st.booleans(),
trans_b=st.booleans(),
run_ints=st.booleans()
)
@settings(deadline=datetime.timedelta(seconds=10))
def test_batch_matmul(self, M, K, N, C, rand_seed, trans_a, trans_b, run_ints):
np.random.seed(rand_seed)
workspace.ResetWorkspace()
batch_dims = [C]
if run_ints:
X = np.random.randint(low=1, high=3, size=((C, M, K))).astype(np.float32)
else:
X = 100 * (np.random.rand(*(batch_dims + [M, K])).astype(np.float32) - 0.5)
if trans_a:
X = X.swapaxes(-1, -2)
if run_ints:
Y = np.random.randint(low=1, high=3, size=((C, K, N))).astype(np.float32)
else:
Y = 100 * (np.random.rand(*(batch_dims + [K, N])).astype(np.float32) - 0.5)
if trans_b:
Y = Y.swapaxes(-1, -2)
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(["X", "Y"])
pred_net.external_output.append("out")
pred_net.op.add().CopyFrom(
core.CreateOperator(
'BatchMatMul', ['X', 'Y'], 'out', trans_a=trans_a, trans_b=trans_b
)
)
pred_net_ref = core.Net("pred_net_ref")
# Reference updated to fp16 with fp32 accumulation
pred_net_ref.BatchMatMulFP16Acc32Fake(
["X", "Y"], ['out'], trans_a=trans_a, trans_b=trans_b)
print("dims", batch_dims, X.shape, Y.shape)
pred_net_onnxified = onnxifi_caffe2_net(pred_net,
{"X": X.shape, "Y": Y.shape},
debug=True,
adjust_batch=False,
use_onnx=False)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("X", X)
workspace.FeedBlob("Y", Y)
workspace.CreateNet(pred_net_onnxified)
workspace.CreateNet(pred_net_ref)
# Run Glow net
workspace.RunNet(pred_net_onnxified.name)
out_glow = workspace.FetchBlob('out')
# Run caffe2 net
workspace.RunNet(pred_net_ref)
out_c2_fakefp16 = workspace.FetchBlob('out')
diff = np.abs(out_c2_fakefp16 - out_glow)
if not np.allclose(out_glow, out_c2_fakefp16):
print_test_debug_info("bmm", {
"seed": rand_seed,
"m": M, "k": K,
"n": N, "X": X.shape, "Y": Y.shape,
"trans_a": trans_a,
"trans_b": trans_b,
"run_ints": run_ints,
"out_glow": out_glow,
"out_c2_fakefp16": out_c2_fakefp16,
"diff": diff
})
assert(0)
if __name__ == "__main__":
unittest.main()

View File

@ -1,143 +0,0 @@
import numpy as np
import unittest
import caffe2.python.fakelowp.init_shared_libs # noqa
from hypothesis import given, settings
from hypothesis import strategies as st
from caffe2.proto import caffe2_pb2
from caffe2.python import core
from caffe2.python import workspace
from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
from caffe2.python.fakelowp.test_utils import print_test_debug_info
import caffe2.python.serialized_test.serialized_test_util as serial
import datetime
core.GlobalInit(["caffe2", "--glow_global_fp16=1",
"--glow_global_fused_scale_offset_fp16=1",
"--glow_global_force_sls_fp16_accum=1"])
GLOW_LOWERED_BATCHNORM = False
def reference_spatialbn_test16(X, scale, bias, mean, var, epsilon, order):
X = X.astype(np.float16)
scale = scale.astype(np.float16)
bias = bias.astype(np.float16)
mean = mean.astype(np.float16)
# var = var.astype(np.float16)
assert(order == "NCHW")
scale = scale[np.newaxis, :, np.newaxis, np.newaxis]
bias = bias[np.newaxis, :, np.newaxis, np.newaxis]
mean = mean[np.newaxis, :, np.newaxis, np.newaxis]
var = var[np.newaxis, :, np.newaxis, np.newaxis]
Y = ((X - mean) * (scale / np.sqrt(var + epsilon).astype(np.float16))) + bias
return Y.astype(np.float32)
# Test the lowered BN op
class BatchnormTest(serial.SerializedTestCase):
# TODO: using hypothesis seed, sweep dimensions
@given(seed=st.integers(0, 65535),
size=st.integers(2, 30),
input_channels=st.integers(2, 40),
batch_size=st.integers(2, 20))
@settings(deadline=datetime.timedelta(seconds=10))
def test_bn(self, seed, size, input_channels, batch_size):
workspace.ResetWorkspace()
np.random.seed(seed)
order = "NCHW"
epsilon = 1e-3
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(["X", "scale", "bias", "mean", "var"])
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"SpatialBN",
["X", "scale", "bias", "mean", "var"],
["Y"],
order=order,
is_test=True,
epsilon=epsilon
)
)
if GLOW_LOWERED_BATCHNORM:
refopname = "SpatialBNFakeLoweredFp16NNPI"
else:
refopname = "SpatialBNFakeFp16NNPI"
pred_net_ref = caffe2_pb2.NetDef()
pred_net_ref.name = "pred"
pred_net_ref.external_input.extend(["X", "scale", "bias", "mean", "var"])
pred_net_ref.external_output.append("X")
pred_net_ref.op.add().CopyFrom(
core.CreateOperator(
refopname,
["X", "scale", "bias", "mean", "var"],
["Y"],
order=order,
is_test=True,
epsilon=epsilon
)
)
scale = np.random.rand(input_channels).astype(np.float32) + 0.5
bias = np.random.rand(input_channels).astype(np.float32) - 0.5
mean = np.random.randn(input_channels).astype(np.float32)
var = np.random.rand(input_channels).astype(np.float32) + 0.5
X = np.random.rand(
batch_size, input_channels, size, size).astype(np.float32) - 0.5
workspace.FeedBlob("scale", scale)
workspace.FeedBlob("bias", bias)
workspace.FeedBlob("mean", mean)
workspace.FeedBlob("var", var)
# Use for reference to debug
# Y_np = reference_spatialbn_test16(X, scale, bias, mean, var, epsilon, order)
pred_net_onnxified = onnxifi_caffe2_net(
pred_net,
{"X": [batch_size, input_channels, size, size],
"scale": [input_channels],
"bias": [input_channels],
"mean": [input_channels],
"var": [input_channels]},
debug=True,
adjust_batch=False,
use_onnx=False
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("X", X)
workspace.CreateNet(pred_net_onnxified)
workspace.CreateNet(pred_net_ref)
workspace.RunNet(pred_net_ref.name)
Y_c2 = workspace.FetchBlob("Y")
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob("Y")
if not np.allclose(Y_glow.astype(np.float16), Y_c2.astype(np.float16)):
diff = np.abs(Y_glow - Y_c2).astype(np.float16)
print_test_debug_info(
"bn",
{
"seed": seed,
"scale": scale,
"bias": bias,
"mean": mean,
"var": var,
"Y_np": Y_c2,
"Y_glow": Y_glow,
"diff": diff,
"rowwise_diff": np.max(np.abs(diff), -1)})
assert(0)

View File

@ -1,142 +0,0 @@
# Must happen before importing caffe2.python.*
import caffe2.python.fakelowp.init_shared_libs # noqa
import datetime
import numpy as np
from hypothesis import given, settings, example
from hypothesis import strategies as st
from caffe2.python import core, workspace
from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
from caffe2.python.fakelowp.test_utils import print_test_debug_info
import caffe2.python.serialized_test.serialized_test_util as serial
# Test that parallel chunks behave the same way as the serial one
workspace.GlobalInit(
[
"caffe2",
"--glow_global_fp16=1",
"--glow_global_fused_scale_offset_fp16=1",
"--glow_global_force_sls_fp16_accum=1",
"--glow_nnpi_num_parallel_chunks=2",
"--glow_use_dag_optimizer=false",
"--glow_dump_graph=true",
]
)
class Fusions(serial.SerializedTestCase):
def _get_scale_zp(self, tensor):
tensor_max = np.max(tensor)
tensor_min = min(0, np.min(tensor))
scale = np.float32(np.float16((tensor_max - tensor_min) / 255.0))
if scale < 1e-6:
scale = np.float32(1e-6)
zero_point = 0 - tensor_min / scale
zero_point = int(round(np.clip(zero_point, 0, 255.0)))
return (scale, zero_point)
@given(
scale=st.floats(1e-4, 1e2),
zp=st.integers(-128, 128),
rand_seed=st.integers(0, 65534),
m=st.integers(32, 64),
k=st.integers(1000, 6000),
n=st.integers(200, 600),
)
# @example(m=64, k=5423, n=553, scale=1e-3, zp=120, rand_seed=1)
@settings(deadline=datetime.timedelta(seconds=1000), max_examples=1)
def test_ParallelFC(self, m, k, n, scale, zp, rand_seed):
np.random.seed(rand_seed)
workspace.ResetWorkspace()
# Y = W_T * X + b
X_fp32 = np.random.uniform(-1, 1, size=(m, k)).astype(np.float16) \
.astype(np.float32)
W_fp32 = np.random.uniform(-1, 1, size=(n, k)).astype(np.float32)
b_fp32 = np.zeros((n,), dtype=np.float32)
X_scale, X_zero_point = self._get_scale_zp(X_fp32)
workspace.FeedBlob("X", X_fp32)
workspace.FeedBlob("W", W_fp32)
workspace.FeedBlob("b", b_fp32)
workspace.RunOperatorOnce(
core.CreateOperator(
"Int8FCPackWeight",
["W"],
["W_int8"],
engine="DNNLOWP",
save_unpacked_weights=True,
in_scale=X_scale,
)
)
ref_net = core.Net("net")
ref_net.Int8QuantizeNNPI(
["X"],
["X_int8"],
Y_scale=X_scale,
Y_zero_point=X_zero_point
)
ref_net.Int8FCFakeAcc32NNPI(
["X_int8", "W_int8", "b"],
["Y_int8"],
Y_scale=X_scale,
Y_zero_point=X_zero_point,
)
ref_net.Int8Relu(
["Y_int8"],
["Y_relu"],
Y_zero_point=X_zero_point,
Y_scale=X_scale,
)
ref_net.Int8DequantizeNNPI(
["Y_relu"],
["Y"]
)
ref_net.Proto().external_output.append("Y")
# run ref_net
workspace.RunNetOnce(ref_net)
Y_fbgemm = workspace.FetchBlob("Y")
# run onnxifi net
ref_net.Proto().op[0].type = "Int8Quantize"
ref_net.Proto().op[1].type = "Int8FC"
ref_net.Proto().op[2].type = "Int8Relu"
ref_net.Proto().op[3].type = "Int8Dequantize"
net_onnxified = onnxifi_caffe2_net(
ref_net.Proto(),
{},
debug=True,
adjust_batch=False,
use_onnx=False,
weight_names=["W_int8", "b"],
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in net_onnxified.op
)
print(net_onnxified)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.CreateNet(net_onnxified)
workspace.RunNet(net_onnxified.name)
Y_glow = workspace.FetchBlob("Y")
if not np.allclose(Y_glow, Y_fbgemm):
diff_Y = np.abs(Y_glow - Y_fbgemm)
print_test_debug_info(
"int8_fc",
{
"seed": rand_seed,
"n": n,
"X": X_fp32,
"W": W_fp32,
"b": b_fp32,
"Y_fbgemm": Y_fbgemm,
"Y_glow": Y_glow,
"diff": diff_Y,
"maxdiff": diff_Y.max(axis=1),
},
)
assert 0

View File

@ -1,159 +0,0 @@
import numpy as np
import caffe2.python.fakelowp.init_shared_libs # noqa
from caffe2.python import core, workspace
from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
from caffe2.python.fakelowp.test_utils import print_test_debug_info
import caffe2.python.serialized_test.serialized_test_util as serial
import datetime
from hypothesis import settings
core.GlobalInit(["caffe2", "--caffe2_log_level=-3", "--glow_global_fp16=1"])
class DeqSwishQuantTest(serial.SerializedTestCase):
def _get_scale_zp(self, tensor):
tensor_max = np.max(tensor)
tensor_min = min(0, np.min(tensor))
scale = np.float32(np.float16((tensor_max - tensor_min) / 255.))
zero_point = -tensor_min / scale
zero_point = int(round(np.clip(zero_point, 0, 255.0)))
return (scale, zero_point)
def _sigmoid(self, x):
return 1. / (1. + np.exp(np.float32(-x)))
def _swish(self, x):
return np.float32(x) * self._sigmoid(x)
@settings(deadline=datetime.timedelta(seconds=10))
def test_swish_int8(self):
np.random.seed(0)
workspace.ResetWorkspace()
n = 256
X_fp32 = np.linspace(-20.5, 8., num=n).astype(np.float32).reshape(1, n)
Y_fp32 = self._swish(X_fp32)
X_scale, X_zero_point = self._get_scale_zp(X_fp32)
Y_scale, Y_zero_point = self._get_scale_zp(Y_fp32)
W_fp32 = np.identity(n, dtype=np.float32)
b_fp32 = np.zeros((n,), dtype=np.float32)
workspace.FeedBlob("X", X_fp32)
workspace.FeedBlob("W", W_fp32)
workspace.FeedBlob("b", b_fp32)
workspace.RunOperatorOnce(
core.CreateOperator(
"Int8FCPackWeight",
["W"],
["W_int8"],
engine="DNNLOWP",
save_unpacked_weights=True,
in_scale=X_scale,
)
)
ref_net1 = core.Net("net")
ref_net1.Int8QuantizeNNPI(
["X"],
["X_int8"],
Y_scale=X_scale,
Y_zero_point=X_zero_point
)
ref_net1.Int8FCFakeAcc32NNPI(
["X_int8", "W_int8", "b"],
["U_int8"],
Y_scale=X_scale,
Y_zero_point=X_zero_point,
)
ref_net1.SwishFakeInt8NNPI(
["U_int8"],
["Y"],
X_scale=X_scale,
X_zero_point=X_zero_point,
Y_scale=Y_scale,
Y_zero_point=Y_zero_point
)
ref_net1.Proto().external_output.append("Y")
ref_net = core.Net("net")
ref_net.Int8QuantizeNNPI(
["X"],
["X_int8"],
Y_scale=X_scale,
Y_zero_point=X_zero_point
)
ref_net.Int8FCFakeAcc32NNPI(
["X_int8", "W_int8", "b"],
["U_int8"],
Y_scale=X_scale,
Y_zero_point=X_zero_point,
)
ref_net.Int8DequantizeNNPI(
["U_int8"],
["U_fp16"],
UsingOneOverScale=False
)
ref_net.SwishFakeFp16NNPI(
["U_fp16"],
["Y_fp16"]
)
ref_net.Int8QuantizeNNPI(
["Y_fp16"],
["Y"],
Y_scale=Y_scale,
Y_zero_point=Y_zero_point
)
ref_net.Proto().external_output.append("Y")
# run ref_net
workspace.RunNetOnce(ref_net1)
Y_fbgemm = workspace.FetchInt8Blob("Y")
# run onnxifi net
ref_net.Proto().op[0].type = "Int8Quantize"
ref_net.Proto().op[1].type = "Int8FC"
ref_net.Proto().op[2].type = "Int8Dequantize"
ref_net.Proto().op[3].type = "Swish"
ref_net.Proto().op[4].type = "Int8Quantize"
net_onnxified = onnxifi_caffe2_net(
ref_net.Proto(),
{},
debug=True,
adjust_batch=False,
use_onnx=False,
weight_names=["W_int8", "b"],
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in net_onnxified.op
)
np.testing.assert_equal(num_onnxified_ops, 1)
# TODO: add an assertion to check the optimized net
# fused Dequantize->Swish->Quantize to QuantizedSwish
workspace.CreateNet(net_onnxified)
workspace.RunNet(net_onnxified.name)
Y_glow = workspace.FetchInt8Blob("Y")
U_int8 = workspace.FetchInt8Blob("U_int8")
diff_Y = np.abs(Y_glow.data - Y_fbgemm.data)
num_mismatches = np.count_nonzero(diff_Y)
max_diff = np.max(diff_Y)
if max_diff > 0 or Y_glow.scale != Y_fbgemm.scale or \
Y_glow.zero_point != Y_fbgemm.zero_point:
print_test_debug_info(
"QuantizedSwish",
{
"X": X_fp32,
"X_scale": X_scale,
"X_zero_point": X_zero_point,
"Y_scale": Y_scale,
"Y_zero_point": Y_zero_point,
"U_int8": U_int8,
"Y_fbgemm": Y_fbgemm,
"Y_glow": Y_glow,
"diff": diff_Y,
"max_diff": max_diff,
"num_mismatches": num_mismatches,
},
)
assert 0

View File

@ -1,357 +0,0 @@
import numpy as np
import unittest
import caffe2.python.fakelowp.init_shared_libs # noqa
from hypothesis import given, settings
from hypothesis import strategies as st
from caffe2.proto import caffe2_pb2
from caffe2.python import core
from caffe2.python import workspace
from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
from caffe2.python.fakelowp.test_utils import print_test_debug_info
import datetime
import caffe2.python.serialized_test.serialized_test_util as serial
core.GlobalInit(["caffe2", "--caffe2_log_level=-3", "--glow_global_fp16=1"])
GLOW_MATMUL_RTOL = 0
class FCTest(serial.SerializedTestCase):
@given(seed=st.integers(0, 65534))
@settings(deadline=datetime.timedelta(seconds=10))
def test_clip(self, seed):
np.random.seed(seed)
m, n, k = 8, 8, 8
dtype = np.float32
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(["X", "W0", "b0", "W1", "b1"])
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"FC",
["X", "W0", "b0"],
["X1"],
)
)
pred_net.op.add().CopyFrom(
core.CreateOperator(
"FC",
["X1", "W1", "b1"],
["Y"],
)
)
workspace.GlobalInit(
['caffe2', '--caffe2_log_level=0', '--glow_global_fp16=1',
'--glow_clip_fp16', '--glow_global_fp16_constants=1'])
workspace.SwitchWorkspace("glow_test_ws", True)
workspace.ResetWorkspace()
W0 = np.full((n, k), 65536.0, dtype)
b0 = np.random.randint(low=1, high=3, size=(n)).astype(dtype)
W1 = np.random.randint(low=1, high=3, size=(n, k)).astype(dtype)
b1 = np.random.randint(low=1, high=3, size=(n)).astype(dtype)
workspace.FeedBlob("W0", W0)
workspace.FeedBlob("b0", b0)
workspace.FeedBlob("W1", W1)
workspace.FeedBlob("b1", b1)
pred_net_onnxified = onnxifi_caffe2_net(
pred_net,
{"X": (m, k)},
debug=True,
adjust_batch=False,
use_onnx=False
)
X = np.random.randint(low=1, high=3, size=(m, k)).astype(dtype)
workspace.FeedBlob("X", X)
workspace.CreateNet(pred_net_onnxified)
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob("Y")
np.testing.assert_allclose(Y_glow, np.full((m, n), 65504.0, dtype))
@given(
m=st.integers(4, 50),
k=st.integers(4, 50),
n=st.integers(4, 50),
seed=st.integers(0, 65534)
)
@settings(deadline=datetime.timedelta(seconds=10))
def test_fc_exercise(self, m, k, n, seed):
""" Test that the matmul engine is working, this doesn't test
precision
"""
np.random.seed(seed)
dtype = np.float32
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(["X", "W0", "b0"])
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"FC",
["X", "W0", "b0"],
["Y"],
)
)
workspace.SwitchWorkspace("glow_test_ws", True)
workspace.ResetWorkspace()
W0 = np.random.randint(low=1, high=3, size=(n, k)).astype(dtype)
b0 = np.random.randint(low=1, high=3, size=(n)).astype(dtype)
workspace.FeedBlob("W0", W0)
workspace.FeedBlob("b0", b0)
pred_net_onnxified = onnxifi_caffe2_net(pred_net,
{"X": (m, k)},
debug=True,
adjust_batch=False,
use_onnx=False)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
np.testing.assert_equal(num_onnxified_ops, 1)
X0 = np.random.randint(low=1, high=3, size=(m, k)).astype(dtype)
workspace.FeedBlob("X", X0)
workspace.CreateNet(pred_net_onnxified)
workspace.CreateNet(pred_net)
num_iterations = 2
for _ in range(num_iterations):
X0 = np.random.randint(low=1, high=3, size=(m, k)).astype(dtype)
workspace.FeedBlob("X", X0)
# Run Glow net
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob('Y')
# Run caffe2 net
workspace.RunNet(pred_net.name)
Y_c2 = workspace.FetchBlob('Y')
if not np.allclose(Y_c2, Y_glow):
print_test_debug_info("fc", {
"seed": seed,
"m": m,
"k": k,
"n": n,
"X": X0,
"W0": W0,
"b0": b0,
"Y_glow": Y_glow,
"Y_c2": Y_c2,
"diff": np.abs((Y_c2 - Y_glow) / Y_c2)})
assert(0)
@given(seed=st.integers(0, 65534))
@settings(deadline=datetime.timedelta(seconds=10))
def test_fc_numeric_cases(self, seed):
""" Test numerics, use examples found from the unit test.
Use Fp16FCAcc16NNPI as a reference.
"""
np.random.seed(seed)
m = 1
k = 20
n = 1
dtype = np.float32
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(["X", "W0", "b0"])
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"FC",
["X", "W0", "b0"],
["Y"],
)
)
pred_net_ref = caffe2_pb2.NetDef()
pred_net_ref.name = "pred"
pred_net_ref.external_input.extend(["X", "W0", "b0"])
pred_net_ref.external_output.append("Y")
pred_net_ref.op.add().CopyFrom(
core.CreateOperator(
"Fp16FCAcc32NNPI",
["X", "W0", "b0"],
["Y"],
)
)
workspace.SwitchWorkspace("glow_test_ws", True)
workspace.ResetWorkspace()
W0 = np.array([[0.04882812, 0.21520996, 0.1027832, 0.04489136,
-0.07635498, 0.14587402,
-0.06240845, 0.3918457, 0.46362305, -0.11657715,
0.29174805, 0.02890015,
0.0680542, 0.4255371, -0.42895508, -0.4128418,
-0.47973633, 0.33251953,
0.27807617, 0.3701172]], dtype=np.float32)
b0 = np.array([0.47851562], dtype=np.float32)
workspace.FeedBlob("W0", W0)
workspace.FeedBlob("b0", b0)
pred_net_onnxified = onnxifi_caffe2_net(pred_net,
{"X": (m, k)},
debug=True,
adjust_batch=False,
use_onnx=False)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
np.testing.assert_equal(num_onnxified_ops, 1)
X_inputs = [
np.array([[
-2.94921875e-01, -3.58642578e-01, -1.92871094e-01,
2.81250000e-01, -1.30126953e-01, 2.32696533e-02,
-4.55566406e-01, -2.31811523e-01, -1.95190430e-01,
-7.76977539e-02, -1.29394531e-01, 2.94677734e-01,
8.96453857e-04, 4.97314453e-01, -6.07604980e-02,
2.55371094e-01, 3.49853516e-01, -1.37695312e-01,
2.95410156e-01, -3.67187500e-01]], dtype=np.float32),
np.array([[
-0.4494629, -0.22192383, -0.1640625, 0.11480713,
-0.09851074, -0.02084351,
0.19091797, -0.17468262, -0.47485352, 0.07489014,
0.03897095, 0.00197601,
0.02835083, -0.27294922, 0.26757812, -0.20996094,
-0.31103516, -0.41601562,
0.09918213, -0.07696533]], dtype=np.float32),
np.array([[
0.01150513, -0.20507812, 0.46704102, 0.00906372,
0.19848633, 0.3720703,
0.46557617, -0.47436523, -0.35107422, -0.0362854,
-0.20812988, 0.41918945,
0.09716797, 0.19897461, 0.3876953, -0.0165863,
0.23535156, 0.29956055,
0.24389648, -0.23486328]], dtype=np.float32)
]
# keep onnxifi happy by feeding something with a shape
workspace.FeedBlob("X", X_inputs[0])
workspace.CreateNet(pred_net_onnxified)
workspace.CreateNet(pred_net_ref)
for i in range(len(X_inputs)):
workspace.FeedBlob("X", X_inputs[i])
# Run Glow net
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob('Y')
workspace.RunNet(pred_net_ref.name)
Y_c2 = workspace.FetchBlob('Y')
diff = np.abs((Y_c2 - Y_glow) / (Y_c2 + 1e-8))
rowdiff = np.max(diff, axis=1)
n_offenders = np.count_nonzero(rowdiff[rowdiff > GLOW_MATMUL_RTOL])
if n_offenders > 0:
print_test_debug_info("fc", {
"seed": seed,
"iter": i,
"m": m,
"k": k,
"n": n,
"W0": W0,
"b0": b0,
"Y_glow": Y_glow,
"Y_c2": Y_c2,
"diff": diff,
"rowdiff": rowdiff})
assert(0)
@given(
m=st.integers(1, 50),
k=st.integers(1, 1000),
n=st.integers(1, 50),
seed=st.integers(0, 65534),
use_packed=st.integers(0, 2)
)
@settings(deadline=datetime.timedelta(seconds=10))
def test_fc_num0(self, seed, m, k, n, use_packed):
""" Test numerics, fix a dimension and determine the ranges of error.
Use Fp16FCAcc16 as a reference.
"""
W = "W_packed" if use_packed else "W0"
dtype = np.float32
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(["X", W, "b0"])
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"FbFCPacked" if use_packed else "FC",
["X", W, "b0"],
["Y"],
)
)
pred_net_ref = caffe2_pb2.NetDef()
pred_net_ref.name = "pred"
pred_net_ref.external_input.extend(["X", W, "b0"])
pred_net_ref.external_output.append("Y")
pred_net_ref.op.add().CopyFrom(
core.CreateOperator(
"Fp16FCAcc32NNPI",
["X", W, "b0"],
["Y"],
)
)
workspace.SwitchWorkspace("glow_test_ws", True)
workspace.ResetWorkspace()
W0 = 10 * (np.random.rand(n, k) - 0.5).astype(np.float16).astype(np.float32)
b0 = 1 * (np.random.rand(n) - 0.5).astype(np.float16).astype(np.float32)
workspace.FeedBlob("W0", W0)
workspace.FeedBlob("b0", b0)
workspace.RunOperatorOnce(
core.CreateOperator(
"FbGemmPack",
['W0'],
['W_packed'],
no_packing=True,
)
)
pred_net_onnxified = onnxifi_caffe2_net(pred_net,
{"X": (m, k)},
debug=True,
adjust_batch=False,
use_onnx=False)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
np.testing.assert_equal(num_onnxified_ops, 1)
X0 = np.random.rand(m, k).astype(dtype) - 0.5
workspace.FeedBlob("X", X0)
workspace.CreateNet(pred_net_onnxified)
workspace.CreateNet(pred_net_ref)
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob('Y')
# Run caffe2 net
workspace.RunNet(pred_net_ref.name)
Y_c2 = workspace.FetchBlob('Y')
diff = np.abs((Y_c2 - Y_glow) / (Y_c2 + 1e-8))
rowdiff = np.max(diff, axis=1)
n_offenders = np.count_nonzero(rowdiff[rowdiff > GLOW_MATMUL_RTOL])
if n_offenders > 0:
print_test_debug_info("fc", {
"seed": seed,
"use_packed": use_packed,
"m": m,
"k": k,
"n": n,
"X": X0.shape,
"W0": W0.shape,
"b0": b0.shape,
"Y_glow": Y_glow,
"Y_c2": Y_c2,
"diff": diff,
"rowdiff": rowdiff})
assert(0)
if __name__ == '__main__':
unittest.main()

View File

@ -1,99 +0,0 @@
# Must happen before importing caffe2.python.*
import caffe2.python.fakelowp.init_shared_libs # noqa
import datetime
import numpy as np
from hypothesis import given, settings
from hypothesis import strategies as st
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace
from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
from caffe2.python.fakelowp.test_utils import print_test_debug_info
import caffe2.python.serialized_test.serialized_test_util as serial
workspace.GlobalInit(
[
"caffe2",
"--glow_global_fp16=1",
"--glow_global_fused_scale_offset_fp16=1",
"--glow_global_force_sls_fp16_accum=1",
]
)
class Fusions(serial.SerializedTestCase):
@given(
scale=st.floats(1e-4, 1e2),
zp=st.integers(-128, 128),
size=st.integers(1, 100000),
rand_seed=st.integers(0, 65534),
)
@settings(deadline=datetime.timedelta(seconds=10))
def test_tanhquantize(self, scale, zp, size, rand_seed):
np.random.seed(rand_seed)
workspace.ResetWorkspace()
pred_net = caffe2_pb2.NetDef()
pred_net.name = "ref"
pred_net.external_input.append("X")
pred_net.external_output.append("Y_q")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"Tanh", ["X"], ["Y"]
)
)
pred_net.op.add().CopyFrom(
core.CreateOperator(
"Int8Quantize", ["Y"], ["Y_q"], Y_scale=scale, Y_zero_point=zp
)
)
X = np.linspace(-1, 1, size).astype(np.float16).astype(np.float32)
pred_net_onnxified = onnxifi_caffe2_net(
pred_net,
{"X": X.shape},
debug=True,
adjust_batch=False,
use_onnx=False,
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op
)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("X", X)
workspace.CreateNet(pred_net_onnxified)
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchInt8Blob("Y_q")
ref_net = caffe2_pb2.NetDef()
ref_net.name = "ref"
ref_net.external_input.append("X")
ref_net.external_output.append("Y_q")
ref_net.op.add().CopyFrom(
core.CreateOperator(
"TanhQuantFakeFp16NNPI", ["X"], ["Y_q"], Y_scale=scale, Y_zero_point=zp
)
)
workspace.CreateNet(ref_net)
workspace.RunNet(ref_net.name)
Y_ref = workspace.FetchInt8Blob("Y_q")
if not np.array_equal(Y_ref.data, Y_glow.data) or \
not Y_ref.scale == Y_glow.scale or \
not Y_ref.zero_point == Y_glow.zero_point:
print_test_debug_info(
"tanhfusion",
{
"scale": scale,
"zp": zp,
"input": X,
"ideal nonquant": np.tanh(X),
"Y_glow": Y_glow,
"Y_c2": Y_ref,
}
)
assert(0)

View File

@ -1,322 +0,0 @@
import caffe2.python.fakelowp.init_shared_libs # noqa
import numpy as np
from caffe2.python import core, workspace
from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
from hypothesis import given, strategies as st, settings
from caffe2.python.fakelowp.test_utils import print_test_debug_info
import caffe2.python.serialized_test.serialized_test_util as serial
import datetime
core.GlobalInit(["caffe2",
"--caffe2_log_level=-3",
"--glow_global_fp16=1",
"--glow_clip_quant_range_to_fp16=1",
"--glow_global_fp16_constants=1"
])
class Int8OpsTest(serial.SerializedTestCase):
def _get_scale_zp(self, tensor):
tensor_max = np.max(tensor)
tensor_min = min(0, np.min(tensor))
scale = np.float32(np.float16((tensor_max - tensor_min) / 255.0))
if scale < 1e-6:
scale = np.float32(1e-6)
zero_point = 0 - tensor_min / scale
zero_point = int(round(np.clip(zero_point, 0, 255.0)))
return (scale, zero_point)
@given(
n=st.integers(2, 1024),
rand_seed=st.integers(0, 65534),
non_zero_offset=st.booleans()
)
@settings(deadline=datetime.timedelta(seconds=50))
def test_int8_quantize(self, n, rand_seed, non_zero_offset):
print("n={}, rand_seed={}".format(n, rand_seed))
np.random.seed(rand_seed)
workspace.ResetWorkspace()
if non_zero_offset:
X_fp32 = np.random.uniform(-1, 1, size=(n, n)).astype(np.float16) \
.astype(np.float32)
else:
X_fp32 = np.random.rand(n, n).astype(np.float16).astype(np.float32)
W_fp32 = np.identity(n, dtype=np.float32)
b_fp32 = np.zeros((n,), dtype=np.float32)
X_scale, X_zero_point = self._get_scale_zp(X_fp32)
workspace.FeedBlob("X", X_fp32)
workspace.FeedBlob("W", W_fp32)
workspace.FeedBlob("b", b_fp32)
workspace.RunOperatorOnce(
core.CreateOperator(
"Int8FCPackWeight",
["W"],
["W_int8"],
engine="DNNLOWP",
save_unpacked_weights=True,
in_scale=X_scale,
)
)
ref_net = core.Net("net")
ref_net.Int8QuantizeNNPI(
["X"],
["X_int8"],
Y_scale=X_scale,
Y_zero_point=X_zero_point
)
ref_net.Int8FCFakeAcc32NNPI(
["X_int8", "W_int8", "b"],
["Y_int8"],
Y_scale=X_scale,
Y_zero_point=X_zero_point,
)
ref_net.Int8DequantizeNNPI(
["Y_int8"],
["Y"]
)
ref_net.Proto().external_output.append("Y")
# run ref_net
workspace.RunNetOnce(ref_net)
Y_fbgemm = workspace.FetchBlob("Y")
# run onnxifi net
ref_net.Proto().op[0].type = "Int8Quantize"
ref_net.Proto().op[1].type = "Int8FC"
ref_net.Proto().op[2].type = "Int8Dequantize"
net_onnxified = onnxifi_caffe2_net(
ref_net.Proto(),
{},
debug=True,
adjust_batch=False,
use_onnx=False,
weight_names=["W_int8", "b"],
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in net_onnxified.op
)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.CreateNet(net_onnxified)
workspace.RunNet(net_onnxified.name)
Y_glow = workspace.FetchBlob("Y")
if not np.allclose(Y_glow, Y_fbgemm):
diff_Y = np.abs(Y_glow - Y_fbgemm)
print_test_debug_info(
"int8_fc",
{
"seed": rand_seed,
"n": n,
"X": X_fp32,
"W": W_fp32,
"b": b_fp32,
"Y_fbgemm": Y_fbgemm,
"Y_glow": Y_glow,
"diff": diff_Y,
"maxdiff": diff_Y.max(axis=1),
},
)
assert 0
@given(
n=st.integers(1, 1024),
m=st.integers(1, 1024),
k=st.integers(1, 1024),
f=st.integers(1, 1),  # TODO: figure out a safe number to increase
rand_seed=st.integers(0, 65534),
quantize_bias=st.sampled_from([False]),
)
@settings(deadline=datetime.timedelta(seconds=50))
def test_int8_fc(
self, n, m, k, rand_seed, quantize_bias, f
):
print(
f"n={n}, m={m}, k={k}, rand_seed={rand_seed}, quantize_bias={quantize_bias}"
)
np.random.seed(rand_seed)
workspace.ResetWorkspace()
ff = float(f)
X_fp32 = np.random.uniform(-ff, ff, size=(m, k)).astype(np.float32)
W_fp32 = np.random.uniform(-ff, ff, size=(n, k)).astype(np.float32)
b_fp32 = np.random.uniform(-ff, ff, size=(n)).astype(np.float32)
X_scale, X_zero_point = self._get_scale_zp(X_fp32)
Y_fp32 = np.dot(X_fp32, W_fp32.T) + b_fp32
Y_scale, Y_zero_point = self._get_scale_zp(Y_fp32)
workspace.FeedBlob("X", X_fp32)
workspace.FeedBlob("W", W_fp32)
workspace.FeedBlob("b", b_fp32)
workspace.RunOperatorOnce(
core.CreateOperator(
"Int8FCPackWeight",
["W", "b"] if quantize_bias else ["W"],
["W_int8", "b_int32"] if quantize_bias else ["W_int8"],
engine="DNNLOWP",
save_unpacked_weights=True,
in_scale=X_scale,
)
)
ref_net = core.Net("net")
ref_net.Int8QuantizeNNPI(
["X"],
["X_int8"],
Y_scale=X_scale,
Y_zero_point=X_zero_point
)
ref_net.Int8FCFakeAcc32NNPI(
["X_int8", "W_int8", "b_int32" if quantize_bias else "b"],
["Y_int8"],
Y_scale=Y_scale,
Y_zero_point=Y_zero_point,
)
ref_net.Int8DequantizeNNPI(
["Y_int8"],
["Y"]
)
ref_net.Proto().external_output.append("Y")
# run ref_net
workspace.RunNetOnce(ref_net)
Y_fbgemm = workspace.FetchBlob("Y")
# run onnxifi net
ref_net.Proto().op[0].type = "Int8Quantize"
ref_net.Proto().op[1].type = "Int8FC"
ref_net.Proto().op[2].type = "Int8Dequantize"
net_onnxified = onnxifi_caffe2_net(
ref_net.Proto(),
{},
debug=True,
adjust_batch=False,
use_onnx=False,
weight_names=["W_int8", "b_int32"] if quantize_bias else ["W_int8", "b"],
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in net_onnxified.op
)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.CreateNet(net_onnxified)
workspace.RunNet(net_onnxified.name)
Y_glow = workspace.FetchBlob("Y")
if not np.allclose(Y_glow, Y_fbgemm):
diff_Y = np.abs(Y_glow - Y_fbgemm)
print_test_debug_info(
"int8_fc",
{
"seed": rand_seed,
"n": n,
"m": m,
"k": k,
"X": X_fp32,
"W": W_fp32,
"b": b_fp32,
"Y_fbgemm": Y_fbgemm,
"Y_glow": Y_glow,
"diff": diff_Y,
"maxdiff": diff_Y.max(axis=1),
},
)
assert 0
@given(
n=st.integers(1, 4),
rand_seed=st.integers(0, 65534)
)
@settings(deadline=datetime.timedelta(seconds=10))
def test_int8_small_input(self, n, rand_seed):
print("n={}, rand_seed={}".format(n, rand_seed))
np.random.seed(rand_seed)
workspace.ResetWorkspace()
X_fp32 = np.random.uniform(0.01, 0.03, size=(n, n)).astype(np.float32)
W_fp32 = np.identity(n, dtype=np.float32)
b_fp32 = np.zeros((n,), dtype=np.float32)
X_scale, X_zero_point = self._get_scale_zp(X_fp32)
workspace.FeedBlob("X", X_fp32)
workspace.FeedBlob("W", W_fp32)
workspace.FeedBlob("b", b_fp32)
workspace.RunOperatorOnce(
core.CreateOperator(
"Int8FCPackWeight",
["W"],
["W_int8"],
engine="DNNLOWP",
save_unpacked_weights=True,
in_scale=X_scale,
)
)
ref_net = core.Net("net")
ref_net.Int8QuantizeNNPI(
["X"],
["X_int8"],
Y_scale=X_scale,
Y_zero_point=X_zero_point
)
ref_net.Int8FCFakeAcc32NNPI(
["X_int8", "W_int8", "b"],
["Y_int8"],
Y_scale=X_scale,
Y_zero_point=X_zero_point,
)
ref_net.Int8DequantizeNNPI(
["Y_int8"],
["Y"]
)
ref_net.Proto().external_output.append("Y")
# run ref_net
workspace.RunNetOnce(ref_net)
Y_fbgemm = workspace.FetchBlob("Y")
# run onnxifi net
ref_net.Proto().op[0].type = "Int8Quantize"
ref_net.Proto().op[1].type = "Int8FC"
ref_net.Proto().op[2].type = "Int8Dequantize"
net_onnxified = onnxifi_caffe2_net(
ref_net.Proto(),
{},
debug=True,
adjust_batch=False,
use_onnx=False,
weight_names=["W_int8", "b"],
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in net_onnxified.op
)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.CreateNet(net_onnxified)
workspace.RunNet(net_onnxified.name)
Y_glow = workspace.FetchBlob("Y")
if not np.allclose(Y_glow, Y_fbgemm):
diff_Y = np.abs(Y_glow - Y_fbgemm)
print_test_debug_info(
"int8_fc",
{
"seed": rand_seed,
"n": n,
"X": X_fp32,
"W": W_fp32,
"b": b_fp32,
"Y_fbgemm": Y_fbgemm,
"Y_glow": Y_glow,
"diff": diff_Y,
"maxdiff": diff_Y.max(axis=1),
},
)
assert 0

View File

@ -1,97 +0,0 @@
# Must happen before importing caffe2.python.*
import caffe2.python.fakelowp.init_shared_libs # noqa
import datetime
import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace
from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
import caffe2.python.serialized_test.serialized_test_util as serial
from hypothesis import settings
workspace.GlobalInit(
[
"caffe2",
"--glow_global_fp16=0",
"--glow_global_fused_scale_offset_fp16=0",
"--glow_global_force_sls_fp16_accum=0",
]
)
class QuantTest(serial.SerializedTestCase):
@settings(deadline=datetime.timedelta(seconds=10))
def test_dequantize(self):
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.append("X")
pred_net.external_output.append("Y")
x_scale = 0.10000000149011612
pred_net.op.add().CopyFrom(
core.CreateOperator(
"Int8Quantize", ["X"], ["I"], Y_scale=x_scale, Y_zero_point=0
)
)
pred_net.op.add().CopyFrom(
core.CreateOperator(
"Int8Dequantize", ["I"], ["Y"],
)
)
print(pred_net)
X = np.asarray([[1, 0], [0, 1]]).astype(np.float32)
workspace.FeedBlob("X", X)
workspace.CreateNet(pred_net)
workspace.RunNet(pred_net.name)
Y_ref = workspace.FetchBlob("Y")
workspace.ResetWorkspace()
pred_net_onnxified = onnxifi_caffe2_net(
pred_net,
{"X": [5, 2]},
debug=True,
adjust_batch=True,
block_list=[0],
use_onnx=False,
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op
)
np.testing.assert_equal(len(pred_net_onnxified.op), 2)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("X", X)
workspace.CreateNet(pred_net_onnxified)
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob("Y")
np.testing.assert_equal(Y_ref, Y_glow)
@settings(deadline=datetime.timedelta(seconds=20))
def test_quantize(self):
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.append("X")
pred_net.external_output.append("Y")
x_scale = 0.10000000149011612
pred_net.op.add().CopyFrom(
core.CreateOperator(
"Int8Quantize", ["X"], ["Y"], Y_scale=x_scale, Y_zero_point=0
)
)
print(pred_net)
X = np.asarray([[1, 0], [0, 1]]).astype(np.float32)
workspace.FeedBlob("X", X)
workspace.RunNetOnce(pred_net)
Y_ref = workspace.FetchInt8Blob("Y")
workspace.ResetWorkspace()
pred_net_onnxified = onnxifi_caffe2_net(
pred_net,
{"X": [2, 2]},
debug=True,
adjust_batch=False,
use_onnx=False,
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op
)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("X", X)
workspace.CreateNet(pred_net_onnxified)
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchInt8Blob("Y")
np.testing.assert_equal(Y_ref.data, Y_glow.data)

View File

@ -1,240 +0,0 @@
import numpy as np
import caffe2.python.fakelowp.init_shared_libs # noqa
from caffe2.proto import caffe2_pb2
from caffe2.python import core
from caffe2.python import workspace
from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
from caffe2.python.fakelowp.test_utils import print_test_debug_info
from hypothesis import given, settings
from hypothesis import strategies as st
import caffe2.python.serialized_test.serialized_test_util as serial
import datetime
core.GlobalInit(["caffe2",
"--glow_global_fp16=1",
"--glow_global_fused_scale_offset_fp16=1",
"--glow_global_force_sls_fp16_accum=1"])
GLOW_LOWERED_BATCHNORM = False
# Test the lowered LayerNorm op
class LayerNorm(serial.SerializedTestCase):
@given(seed=st.integers(0, 65535),
batch_size=st.integers(min_value=1, max_value=50),
size=st.integers(min_value=2, max_value=128),
epsilon=st.floats(min_value=1e-4, max_value=1e-3),
elementwise_affine=st.booleans())
@settings(deadline=datetime.timedelta(seconds=10))
def test_layernorm(self, seed, batch_size, size, epsilon, elementwise_affine):
np.random.seed(seed)
# Reset the workspace
workspace.ResetWorkspace()
axis = 1
dims = np.array(([batch_size, size]))
X = np.random.uniform(size=dims).astype(np.float32) - 0.5
gamma = np.random.randn(*X.shape[axis:]).astype(np.float32)
beta = np.random.randn(*X.shape[axis:]).astype(np.float32)
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(["X", "gamma", "beta"])
pred_net.external_output.extend(["Y", "mean", "rstd"])
pred_net.op.add().CopyFrom(
core.CreateOperator(
"LayerNorm",
["X", "gamma", "beta"] if elementwise_affine else ["X"],
["Y", "mean", "rstd"],
axis=axis,
epsilon=epsilon,
elementwise_affine=elementwise_affine
)
)
pred_net_ref = caffe2_pb2.NetDef()
pred_net_ref.name = "pred_ref"
pred_net_ref.external_input.extend(["X", "gamma", "beta"])
pred_net_ref.external_output.extend(["Y", "mean", "rstd"])
pred_net_ref.op.add().CopyFrom(
core.CreateOperator(
"LayerNormFakeFP16NNPI",
["X", "gamma", "beta"] if elementwise_affine else ["X"],
["Y", "mean", "rstd"],
axis=axis,
epsilon=epsilon,
elementwise_affine=elementwise_affine
)
)
shape_hits = {"X": X.shape, "gamma": gamma.shape, "beta": beta.shape}
pred_net_onnxified = onnxifi_caffe2_net(
pred_net,
shape_hits,
debug=True,
adjust_batch=True,
use_onnx=False
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("X", X)
workspace.FeedBlob("gamma", gamma)
workspace.FeedBlob("beta", beta)
workspace.CreateNet(pred_net_ref)
workspace.CreateNet(pred_net_onnxified)
workspace.RunNet(pred_net_ref.name)
Y_c2 = workspace.FetchBlob("Y")
dims1 = np.array(([1, *dims]))
X_glow = X.reshape(dims1)
workspace.FeedBlob("X", X_glow)
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob("Y")
if not np.allclose(Y_glow, Y_c2):
diff_Y = np.abs(Y_glow - Y_c2)
print_test_debug_info(
"layernorm",
{
"seed": seed,
"size": size,
"batch_size": batch_size,
"epsilon": epsilon,
"gamma": gamma,
"beta": beta,
"elementwise_affine": elementwise_affine,
"X": X,
"Y_glow": Y_glow,
"Y_c2": Y_c2,
"diff_Y": diff_Y,
}
)
assert(0)
def _get_scale_zp(self, tensor):
tensor_max = np.max(tensor)
tensor_min = min(0, np.min(tensor))
scale = np.float32(np.float16((tensor_max - tensor_min) / 255.0))
if scale < 1e-6:
scale = np.float32(1e-6)
zero_point = 0 - tensor_min / scale
zero_point = int(round(np.clip(zero_point, 0, 255.0)))
return (scale, zero_point)
def _layernorm_transform(self, X):
mean = np.mean(X, axis=1)
mean_exp = np.outer(mean, np.ones(X.shape[1]))
std = np.std(X, axis=1)
std_exp = np.outer(std, np.ones(X.shape[1]))
Y = (X - mean_exp) / std_exp
return Y
@given(seed=st.integers(0, 65535),
batch_size=st.integers(min_value=1, max_value=50),
size=st.integers(min_value=2, max_value=128),
epsilon=st.floats(min_value=1e-4, max_value=1e-3),
elementwise_affine=st.booleans())
@settings(deadline=datetime.timedelta(seconds=10))
# re-enable when T74553975 gets fixed
def test_fused_ln_quantize(self, seed, batch_size, size, epsilon, elementwise_affine):
np.random.seed(seed)
# Reset the workspace
workspace.ResetWorkspace()
axis = 1
dims = np.array(([batch_size, size]))
X = np.random.uniform(size=dims).astype(np.float32) - 0.5
gamma = np.random.randn(*X.shape[axis:]).astype(np.float32)
beta = np.random.randn(*X.shape[axis:]).astype(np.float32)
Y = self._layernorm_transform(X)
scale, zp = self._get_scale_zp(Y)
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(["X", "gamma", "beta"])
pred_net.external_output.extend(["Y_q"])
pred_net.op.add().CopyFrom(
core.CreateOperator(
"LayerNorm",
["X", "gamma", "beta"] if elementwise_affine else ["X"],
["Y", "mean", "rstd"],
axis=axis,
epsilon=epsilon,
elementwise_affine=elementwise_affine
)
)
pred_net.op.add().CopyFrom(
core.CreateOperator(
"Int8Quantize", ["Y"], ["Y_q"], Y_scale=scale, Y_zero_point=zp
)
)
print(pred_net)
pred_net_ref = caffe2_pb2.NetDef()
pred_net_ref.name = "pred_ref"
pred_net_ref.external_input.extend(["X", "gamma", "beta"])
pred_net_ref.external_output.extend(["Y_q"])
pred_net_ref.op.add().CopyFrom(
core.CreateOperator(
"LayerNormInt8QuantizeFakeNNPI",
["X", "gamma", "beta"] if elementwise_affine else ["X"],
["Y_q", "mean", "rstd"],
axis=axis,
epsilon=epsilon,
elementwise_affine=elementwise_affine,
Y_scale=scale, Y_zero_point=zp
)
)
shape_hits = {"X": X.shape, "gamma": gamma.shape, "beta": beta.shape}
pred_net_onnxified = onnxifi_caffe2_net(
pred_net,
shape_hits,
debug=True,
adjust_batch=True,
use_onnx=False
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("X", X)
workspace.FeedBlob("gamma", gamma)
workspace.FeedBlob("beta", beta)
workspace.CreateNet(pred_net_ref)
workspace.CreateNet(pred_net_onnxified)
workspace.RunNet(pred_net_ref.name)
Y_c2 = workspace.FetchInt8Blob("Y_q")
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchInt8Blob("Y_q")
if not np.allclose(Y_glow.data, Y_c2.data) or \
Y_glow.scale != Y_c2.scale or Y_glow.zero_point != Y_c2.zero_point:
diff_Y = np.abs(Y_glow.data.astype(np.float32) - Y_c2.data.astype(np.float32))
print_test_debug_info(
"layernorm",
{
"seed": seed,
"size": size,
"batch_size": batch_size,
"epsilon": epsilon,
"gamma": gamma,
"beta": beta,
"elementwise_affine": elementwise_affine,
"X": X,
"Y_glow": Y_glow,
"Y_c2": Y_c2,
"diff_Y": diff_Y,
}
)
assert(0)

View File

@ -1,368 +0,0 @@
import numpy as np
import caffe2.python.fakelowp.init_shared_libs # noqa
import datetime
from hypothesis import given, settings
from hypothesis import strategies as st
from caffe2.proto import caffe2_pb2
from caffe2.python import core
from caffe2.python import workspace
from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
from caffe2.python.fakelowp.test_utils import print_test_debug_info
from caffe2.python.fakelowp.test_utils import compute_ulp_error
import caffe2.python.serialized_test.serialized_test_util as serial
core.GlobalInit(["caffe2", "--caffe2_log_level=-3", "--glow_global_fp16=1"])
kEpsilon = 1e-8
class ArithmeticOpsTest(serial.SerializedTestCase):
def _test_binary_op_graph(self, name, seed):
np.random.seed(seed)
workspace.ResetWorkspace()
# First dimension is the batch size
dims = np.concatenate((np.array([1]), np.random.randint(1, 20, size=3)))
A = np.random.uniform(low=-100.0, high=100.0, size=dims).astype(np.float32)
B = np.random.uniform(low=-100.0, high=100.0, size=dims).astype(np.float32)
# Avoid dividing by 0
B[np.abs(B) < 1e-3] = 1e-3
print(A.shape, B.shape)
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(["A", "B"])
pred_net.external_output.append("C")
pred_net.op.add().CopyFrom(
core.CreateOperator(
name,
["A", "B"],
["C"]
)
)
pred_net_ref = caffe2_pb2.NetDef()
pred_net_ref.name = "ref"
pred_net_ref.external_input.extend(["A", "B"])
pred_net_ref.external_output.append("C_ref")
pred_net_ref.op.add().CopyFrom(
core.CreateOperator(
name + "FakeFp16",
["A", "B"],
["C_ref"],
)
)
shape_hints = {"A": A.shape, "B": B.shape}
pred_net_onnxified = onnxifi_caffe2_net(pred_net,
shape_hints,
debug=True,
adjust_batch=True,
use_onnx=False)
print(pred_net_onnxified)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.SwitchWorkspace("glow_test_ws", True)
workspace.FeedBlob("A", A)
workspace.FeedBlob("B", B)
workspace.CreateNet(pred_net_ref)
workspace.CreateNet(pred_net_onnxified)
num_iterations = 10
for _ in range(num_iterations):
A = np.random.uniform(low=-100.0, high=100.0, size=dims).astype(np.float32)
B = np.random.uniform(low=-100.0, high=100.0, size=dims).astype(np.float32)
# Avoid dividing by 0
B[np.abs(B) < 1e-3] = 1e-3
workspace.FeedBlob("A", A)
workspace.FeedBlob("B", B)
# Run caffe2 net
workspace.RunNet(pred_net_ref.name)
Y_c2 = workspace.FetchBlob("C_ref")
# Run Glow net
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob("C")
Y_glow[Y_glow == np.Inf] = np.finfo(np.float16).max
Y_glow[Y_glow == np.NINF] = np.finfo(np.float16).min
# Ignore mismatches solely due to difference in precision
fp16_finite = np.isfinite(A.astype(np.float16) / B.astype(np.float16))
# Results should be identical since we are comparing with the C2 emulation
if not np.allclose(Y_c2[fp16_finite], Y_glow[fp16_finite]):
diff = np.abs((Y_glow - Y_c2) / (Y_c2 + kEpsilon))
print_test_debug_info(name, {
"dims": dims, "iter": _, "seed": seed, "A": A, "B": B,
"Y_glow": Y_glow, "Y_c2": Y_c2, "diff": diff})
assert(0)
@given(seed=st.integers(0, 65534))
@settings(deadline=datetime.timedelta(seconds=10))
def test_add_graph(self, seed):
self._test_binary_op_graph("Add", seed)
@given(seed=st.integers(0, 65534))
@settings(deadline=datetime.timedelta(seconds=10))
def test_sub_graph(self, seed):
self._test_binary_op_graph("Sub", seed)
@given(seed=st.integers(0, 65534))
@settings(deadline=datetime.timedelta(seconds=10))
def test_mul_graph(self, seed):
self._test_binary_op_graph("Mul", seed)
@given(seed=st.integers(0, 65534))
@settings(deadline=datetime.timedelta(seconds=10))
def test_div_graph(self, seed):
self._test_binary_op_graph("Div", seed)
class UnaryOpTest(serial.SerializedTestCase):
def _test_unary_op(self, opname, X, rtol=1e-5, atol=1e-8):
workspace.ResetWorkspace()
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.append("X")
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
opname,
['X'],
['Y'])
)
ref_net = caffe2_pb2.NetDef()
ref_net.name = "ref"
ref_net.external_input.append("X")
ref_net.external_output.append("Y")
ref_net.op.add().CopyFrom(
core.CreateOperator(
opname + 'FakeFp16NNPI',
['X'],
['Y'])
)
print("REF NET = {}".format(ref_net))
shape_hints = {"X": X.shape}
pred_net_onnxified = onnxifi_caffe2_net(pred_net,
shape_hints,
debug=True,
adjust_batch=False,
use_onnx=False)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.SwitchWorkspace("glow_test_ws", True)
workspace.FeedBlob("X", X)
workspace.CreateNet(ref_net)
workspace.CreateNet(pred_net_onnxified)
# Run Glow net
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob('Y')
# Run caffe2 reference net
workspace.RunNet(ref_net.name)
Y_c2 = workspace.FetchBlob('Y')
if not np.allclose(Y_c2, Y_glow, rtol=rtol, atol=atol):
diff = np.abs(Y_c2 - Y_glow)
np.save('/tmp/' + opname + 'diff', diff)
np.save('/tmp/' + opname + 'result', Y_c2)
print_test_debug_info(opname, {
"X": X,
"Y_c2": Y_c2,
"Y_glow": Y_glow,
"diff": diff
})
assert(0)
return Y_glow
def _test_op_w_ulp_error(self, seed, opname, regions, atol=0, err_threshold=2):
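# Sweeps each [x0, x1] region with 1025 fp16-representable points, runs the op
# through Glow via _test_unary_op, and tracks the worst-case ULP (unit in the
# last place) error reported by compute_ulp_error; the test fails once that
# error exceeds err_threshold.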
ulp_err = 0
for x0, x1 in regions:
X = np.linspace(x0, x1, num=1025, dtype=np.float16).astype(np.float32)
Y_glow = self._test_unary_op(opname, X, atol=atol)
region_err = compute_ulp_error(opname, X, Y_glow)
ulp_err = max(np.max(np.abs(region_err)), ulp_err)
if (ulp_err > err_threshold):
print(r'{} Op detected ulp_err={}'.format(opname, ulp_err))
assert(0)
# These tests don't need to run multiple times given that they are
# linear sweeps and therefore deterministic.
# Once hypothesis.testing version is updated, we can re-enable
# testing with different hypothesis examples.
@given(seed=st.integers(0, 65534))
@settings(deadline=datetime.timedelta(seconds=20))
def test_sigmoid(self, seed):
np.random.seed(seed)
opname = "Sigmoid"
regions = [[-8., -4.], [-4., -2.], [-2., -1.], [-1., -.5], [-.5, -.25],
[-.25, .25], [.25, .5], [.5, 1.], [1., 2.], [2., 4.],
[4., 8.]]
self._test_op_w_ulp_error(seed, opname, regions, atol=0, err_threshold=2.5)
# These tests don't need to run multiple times given that they are
# linear sweeps and therefore deterministic.
# Once hypothesis.testing version is updated, we can re-enable
# testing with different hypothesis examples.
@given(seed=st.integers(0, 65534))
@settings(deadline=datetime.timedelta(seconds=20))
def test_tanh(self, seed):
np.random.seed(seed)
opname = "Tanh"
regions = [[2.**(-9), 2.**(-8)], [2.**(-8), 2.**(-7)],
[2.**(-7), 2.**(-6)], [2.**(-6), 2.**(-5)],
[2.**(-5), 2.**(-4)], [2.**(-4), 2.**(-3)],
[2.**(-3), 2.**(-2)], [2.**(-2), 2.**(-1)],
[2.**(-1), 1.], [1., 2.], [2., 4.], [4., 8.]]
self._test_op_w_ulp_error(seed, opname, regions, atol=0, err_threshold=2)
# These tests don't need to run multiple times given that they are
# linear sweeps and therefore deterministic.
# Once hypothesis.testing version is updated, we can re-enable
# testing with different hypothesis examples.
# TODO: move atol to 1e-8 once we get a non-lowered swish implementation
@given(seed=st.integers(0, 65534))
@settings(deadline=datetime.timedelta(seconds=10))
def test_swish(self, seed):
np.random.seed(seed)
opname = "Swish"
regions = [[-20.5, -11.], [-11., -8.], [-8., -1.], [-1., -0.1],
[-1. / 8., 1. / 8.], [1. / 8, 5.], [5., 8.]]
self._test_op_w_ulp_error(seed, opname, regions, atol=0.008, err_threshold=384)
# These tests don't need to run multiple times given that they are
# linear sweeps and therefore deterministic.
# Once hypothesis.testing version is updated, we can re-enable
# testing with different hypothesis examples.
@given(seed=st.integers(0, 65534))
@settings(deadline=datetime.timedelta(seconds=10))
def test_logit(self, seed):
np.random.seed(seed)
workspace.ResetWorkspace()
n = 1
m = 15361
X = np.linspace(0, 1, num=m, dtype=np.float32)
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.append("X")
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
'Logit',
['X'],
['Y'],
eps=1e-6)
)
ref_net = caffe2_pb2.NetDef()
ref_net.name = "ref"
ref_net.external_input.append("X")
ref_net.external_output.append("Y")
ref_net.op.add().CopyFrom(
core.CreateOperator(
'LogitFakeFp16NNPI',
['X'],
['Y'],
eps=1e-6)
)
print("REF NET = {}".format(ref_net))
shape_hints = {"X": (n, m)}
pred_net_onnxified = onnxifi_caffe2_net(pred_net,
shape_hints,
debug=True,
adjust_batch=False,
use_onnx=False)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.SwitchWorkspace("glow_test_ws", True)
workspace.FeedBlob("X", X)
workspace.CreateNet(ref_net)
workspace.CreateNet(pred_net_onnxified)
# Run Glow net
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob('Y')
# Run caffe2 reference net
workspace.RunNet(ref_net.name)
Y_c2 = workspace.FetchBlob('Y')
diff = np.abs(Y_c2 - Y_glow)
if np.nanmax(diff) > 9e-3:
np.save('/tmp/logit_diff', diff)
np.save('/tmp/logit_result', Y_c2)
print_test_debug_info('Logit', {
"X": X,
"Y_c2": Y_c2,
"Y_glow": Y_glow,
"diff": diff
})
assert(0)
class ReluTest(serial.SerializedTestCase):
@given(seed=st.integers(0, 65534))
@settings(deadline=datetime.timedelta(seconds=10))
def test_relu(self, seed):
np.random.seed(seed)
inputs = np.random.rand(1).astype(np.float32)
X = inputs[0]
# First dimension is the batch size
print(X.shape)
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(["X"])
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"Relu",
["X"],
["Y"]
)
)
pred_net_ref = caffe2_pb2.NetDef()
pred_net_ref.name = "ref"
pred_net_ref.external_input.extend(["X"])
pred_net_ref.external_output.append("Y_ref")
pred_net_ref.op.add().CopyFrom(
core.CreateOperator(
"ReluFakeFp16",
["X"],
["Y_ref"],
)
)
shape_hints = {"X": X.shape}
pred_net_onnxified = onnxifi_caffe2_net(pred_net,
shape_hints,
debug=True,
adjust_batch=True,
use_onnx=False)
print(pred_net_onnxified)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.SwitchWorkspace("glow_test_ws", True)
workspace.FeedBlob("X", X)
workspace.CreateNet(pred_net_ref)
workspace.CreateNet(pred_net_onnxified)
workspace.FeedBlob("X", X)
# Run caffe2 net
workspace.RunNet(pred_net_ref.name)
Y_c2 = workspace.FetchBlob("Y_ref")
# Run Glow net
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob("Y")
# Results should be identical since we are comparing with the C2 emulation
if not np.allclose(Y_c2, Y_glow):
diff = np.abs((Y_glow - Y_c2) / (Y_c2 + kEpsilon))
print_test_debug_info("Relu", {
"seed": seed, "X": X,
"Y_glow": Y_glow, "Y_c2": Y_c2, "diff": diff})
assert(0)

View File

@ -1,215 +0,0 @@
import numpy as np
import unittest
# Must happen before importing caffe2.python.*
import caffe2.python.fakelowp.init_shared_libs # noqa
from hypothesis import given, settings
from hypothesis import strategies as st
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace
from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
from caffe2.python.fakelowp.test_utils import print_test_debug_info
import caffe2.python.serialized_test.serialized_test_util as serial
import datetime
workspace.GlobalInit(["caffe2", "--glow_global_fp16=1",
"--glow_global_fused_scale_offset_fp16=1",
"--glow_global_force_sls_fp16_accum=1"])
class SparseLengthsSum4BitFakeNNPIFp16Test(serial.SerializedTestCase):
@given(seed=st.integers(0, 65535))
@settings(deadline=datetime.timedelta(seconds=10))
def test_slws_fused_4bit_rowwise_all_same(self, seed):
np.random.seed(seed)
workspace.ResetWorkspace()
n = 1
m = 2
data = np.ones((n, m)).astype(np.float32) * 0.2 - 0.1
max_segments = 5
max_segment_length = 100
num_lengths = np.random.randint(1, max_segments + 1)
# number of segments to run
lengths = np.random.randint(0, max_segment_length + 1,
size=num_lengths).astype(np.int32)
num_indices = np.sum(lengths)
indices = np.zeros(num_indices, dtype=np.int64)
weights = np.random.uniform(low=-0.5, high=0.5, size=[len(indices)])\
.astype(np.float32)
weights = np.ones(len(indices)).astype(np.float32)
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"])
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused4BitRowwise",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
ref_net = caffe2_pb2.NetDef()
ref_net.name = "ref"
ref_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"])
ref_net.external_output.append("Y")
ref_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused4BitRowwiseFakeFP16NNPI",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
workspace.FeedBlob("data", data)
workspace.RunOperatorOnce(
core.CreateOperator(
"FloatToFused4BitRowwiseQuantized",
['data'],
['quantized_data']
)
)
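# Note: FloatToFused4BitRowwiseQuantized packs two 4-bit values per byte and
# appends a per-row fp16 scale and bias, so the reference NNPI emulation and
# the Glow-lowered net below both consume the same fused "quantized_data" blob.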
print("quantized", workspace.FetchBlob("quantized_data"))
pred_net_onnxified = onnxifi_caffe2_net(
pred_net,
{},
max_batch_size=max_segments,
max_seq_size=max_segment_length,
debug=True,
adjust_batch=True,
use_onnx=False
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("indices", indices)
workspace.FeedBlob("lengths", lengths)
workspace.FeedBlob("weights", weights)
workspace.CreateNet(pred_net_onnxified)
workspace.CreateNet(ref_net)
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob('Y')
workspace.RunNet(ref_net.name)
Y_c2 = workspace.FetchBlob('Y')
if not np.allclose(Y_c2, Y_glow):
print_test_debug_info(
"slws_fused_4bit_rowwise",
{"seed": seed,
"indices": indices,
"data": data,
"lengths": lengths,
"weights": weights,
"Y_c2": Y_c2,
"Y_glow": Y_glow,
"diff": Y_glow - Y_c2,
"rowwise_diff": (Y_glow - Y_c2)[:, 0]})
assert(0)
@given(
seed=st.integers(0, 65535),
num_rows=st.integers(2, 20),
embedding_dim=st.sampled_from([8, 12, 16, 24, 32, 54, 64, 72, 128]),
batch_size=st.integers(1, 32),
max_weight=st.integers(0, 1),
)
@settings(deadline=datetime.timedelta(seconds=10))
def test_slws_fused_4bit_rowwise(self, seed, num_rows, embedding_dim, batch_size, max_weight):
workspace.ResetWorkspace()
np.random.seed(seed)
data = np.random.rand(num_rows, embedding_dim).astype(np.float32)
data = data * 1e-3
lengths = np.random.choice(np.arange(1, num_rows), batch_size).astype(np.int32)
_indices = []
for length in lengths:
_indices.extend(np.random.choice(np.arange(1, num_rows), length))
indices = np.asarray(_indices).astype(np.int64)
weights = np.random.uniform(
low=0,
high=max_weight,
size=[len(indices)]
).astype(np.float32) - max_weight / 2.0
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"])
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused4BitRowwise",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
ref_net = caffe2_pb2.NetDef()
ref_net.name = "ref"
ref_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"])
ref_net.external_output.append("Y")
ref_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused4BitRowwiseFakeFP16NNPI",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
workspace.FeedBlob("data", data)
workspace.RunOperatorOnce(
core.CreateOperator(
"FloatToFused4BitRowwiseQuantized",
["data"],
["quantized_data"]
)
)
pred_net_onnxified = onnxifi_caffe2_net(
pred_net,
{},
max_batch_size=batch_size,
max_seq_size=np.max(lengths),
debug=True,
adjust_batch=True,
use_onnx=False
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("indices", indices)
workspace.FeedBlob("lengths", lengths)
workspace.FeedBlob("weights", weights)
workspace.CreateNet(pred_net_onnxified)
workspace.CreateNet(ref_net)
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob('Y')
workspace.RunNet(ref_net.name)
Y_c2 = workspace.FetchBlob('Y')
if not np.allclose(Y_c2, Y_glow):
print_test_debug_info(
"slws_fused_4bit_rowwise",
{
"seed": seed,
"indices": indices,
"data": data.shape,
"lengths": lengths,
"weights": weights,
"Y_c2": Y_c2.shape,
"Y_glow": Y_glow.shape,
"diff": Y_glow - Y_c2,
"rowwise_diff": (Y_glow - Y_c2)[:, 0]
}
)
assert(0)
if __name__ == '__main__':
unittest.main()

View File

@ -1,566 +0,0 @@
import unittest
from typing import Dict, Any
# Must happen before importing caffe2.python.*
import caffe2.python.fakelowp.init_shared_libs # noqa
import datetime
import numpy as np
from hypothesis import given, settings
from hypothesis import strategies as st
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace
from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
from caffe2.python.fakelowp.test_utils import print_test_debug_info
import caffe2.python.serialized_test.serialized_test_util as serial
workspace.GlobalInit(
[
"caffe2",
"--glow_global_fp16=1",
"--glow_global_fused_scale_offset_fp16=1",
"--glow_global_force_sls_fp16_accum=1",
]
)
GLOW_MATMUL_ATOL = 1e-5
GLOW_MATMUL_RTOL = 1e-3
class SparseLengthsSum8BitFakeNNPIFp16Test(serial.SerializedTestCase):
def Skip_test_SLS_NonQuantized_fp16(self):
N = 20000
DIM = 64
D = (4 * np.random.random_sample((N, DIM)) + 1).astype(np.float32)
I = (np.random.randint(0, N, size=12)).astype(np.int64)
L = np.asarray([4, 4, 4]).astype(np.int32)
workspace.FeedBlob("D", D)
ref_c2_net = core.Net("test_ref_c2")
ref_c2_net.SparseLengthsSum(["D", "I", "L"], "ref_out")
ref_c2_net.Proto().external_input.extend(["D", "I", "L"])
ref_c2_net.Proto().external_output.extend(["ref_out"])
fp16_c2_net = core.Net("test_fp16_c2")
fp16_c2_net.SparseLengthsSumFakeFP16AccFP16(["D", "I", "L"], "fp16_out")
input_dict: Dict[Any, Any] = {}
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(["D", "I", "L"])
pred_net.external_output.append("glow_out")
pred_net.op.add().CopyFrom(
core.CreateOperator("SparseLengthsSum", ["D", "I", "L"], ["glow_out"])
)
onnxified_net = onnxifi_caffe2_net(
pred_net,
input_dict,
max_batch_size=3,
max_seq_size=16,
debug=True,
adjust_batch=False,
use_onnx=False,
)
num_onnxified_ops = sum(
1 if op.type == "Onnxifi" else 0 for op in onnxified_net.op
)
print(onnxified_net)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("I", I)
workspace.FeedBlob("L", L)
workspace.RunNetOnce(ref_c2_net)
ref_c2_out = workspace.FetchBlob("ref_out")
workspace.RunNetOnce(fp16_c2_net)
fp16_c2_out = workspace.FetchBlob("fp16_out")
np.testing.assert_allclose(fp16_c2_out, ref_c2_out, atol=1e-3, rtol=1e-3)
workspace.RunNetOnce(onnxified_net)
fp16_glow_out = workspace.FetchBlob("glow_out")
if not np.allclose(fp16_glow_out, fp16_c2_out):
diff = np.abs(fp16_glow_out - fp16_c2_out)
print_test_debug_info(
"sls",
{
"indices": I,
"data": D,
"lengths": L,
"Y_c2": fp16_c2_out,
"Y_glow": fp16_glow_out,
"diff": diff,
"rowwise_diff": diff[:, 0],
},
)
assert 0
@given(seed=st.integers(0, 65535))
@settings(deadline=datetime.timedelta(seconds=10))
def test_slws_fused_8bit_rowwise_all_same(self, seed):
# Comment out for predictable debugging
np.random.seed(seed)
workspace.ResetWorkspace()
n = 1
m = 2
data = np.ones((n, m)).astype(np.float32) * 0.2 - 0.1
max_segments = 5
max_segment_length = 200
num_lengths = np.random.randint(1, max_segments + 1)
# number of segments to run
lengths = np.random.randint(0, max_segment_length + 1, size=num_lengths).astype(
np.int32
)
num_indices = np.sum(lengths)
indices = np.zeros(num_indices, dtype=np.int64)
weights = np.random.uniform(low=-0.5, high=0.5, size=[len(indices)]).astype(
np.float32
)
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"]
)
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused8BitRowwise",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
ref_net = caffe2_pb2.NetDef()
ref_net.name = "ref"
ref_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"]
)
ref_net.external_output.append("Y")
ref_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
workspace.FeedBlob("data", data)
workspace.RunOperatorOnce(
core.CreateOperator(
"FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"]
)
)
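# Note: the fused 8-bit rowwise format produced above stores, per row, the
# uint8-quantized values followed by a per-row scale and bias, so the reference
# and Glow nets below read the same "quantized_data" representation.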
pred_net_onnxified = onnxifi_caffe2_net(
pred_net,
{},
max_batch_size=max_segments,
max_seq_size=max_segment_length,
debug=True,
adjust_batch=True,
use_onnx=False,
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op
)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("indices", indices)
workspace.FeedBlob("lengths", lengths)
workspace.FeedBlob("weights", weights)
workspace.CreateNet(pred_net_onnxified)
workspace.CreateNet(ref_net)
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob("Y")
workspace.RunNet(ref_net.name)
Y_c2 = workspace.FetchBlob("Y")
if not np.allclose(Y_c2, Y_glow):
print_test_debug_info(
"slws_fused_8bit_rowwise",
{
"seed": seed,
"indices": indices,
"data": data,
"lengths": lengths,
"weights": weights,
"Y_c2": Y_c2,
"Y_glow": Y_glow,
"diff": Y_glow - Y_c2,
"rowwise_diff": (Y_glow - Y_c2)[:, 0],
},
)
assert 0
@given(
seed=st.integers(0, 65535),
num_rows=st.integers(2, 20),
embedding_dim=st.sampled_from([8, 12, 16, 24, 32, 54, 64, 128]),
batch_size=st.integers(1, 5),
max_weight=st.integers(0, 100),
)
@settings(deadline=datetime.timedelta(seconds=10))
def test_slws_fused_8bit_rowwise(self, seed, num_rows, embedding_dim, batch_size, max_weight):
np.random.seed(seed)
workspace.ResetWorkspace()
data = np.random.rand(num_rows, embedding_dim).astype(np.float32)
lengths = np.random.choice(np.arange(1, num_rows), batch_size).astype(np.int32)
_indices = []
for length in lengths:
_indices.extend(np.random.choice(np.arange(1, num_rows), length))
indices = np.asarray(_indices).astype(np.int64)
weights = np.random.uniform(
low=0,
high=max_weight,
size=[len(indices)]
).astype(np.float32)
assert(len(weights) < 64000)
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"]
)
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused8BitRowwise",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
ref_net = caffe2_pb2.NetDef()
ref_net.name = "ref"
ref_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"]
)
ref_net.external_output.append("Y")
ref_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
workspace.FeedBlob("data", data)
workspace.RunOperatorOnce(
core.CreateOperator(
"FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"]
)
)
onnxified_net = onnxifi_caffe2_net(
pred_net,
{},
max_batch_size=batch_size,
max_seq_size=np.max(lengths),
debug=True,
adjust_batch=True,
use_onnx=False,
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in onnxified_net.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("indices", indices)
workspace.FeedBlob("lengths", lengths)
workspace.FeedBlob("weights", weights)
workspace.CreateNet(onnxified_net)
workspace.CreateNet(ref_net)
workspace.RunNet(onnxified_net.name)
Y_glow = workspace.FetchBlob("Y")
workspace.RunNet(ref_net.name)
Y_ref = workspace.FetchBlob("Y")
diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
max_err = np.max(diff, axis=1)
num_offenders = (max_err > 0).sum()
if num_offenders > 0:
print_test_debug_info(
"slws_fused_8bit_rowwise_inv_scale",
{
"seed": seed,
"num_rows": num_rows,
"embedding_dim": embedding_dim,
"batch_size": batch_size,
"max_weight": max_weight,
"indices": indices,
"data": data.shape,
"lengths": lengths,
"weights": weights,
"Y_glow": Y_glow,
"Y_ref": Y_ref,
"diff": diff,
"rowwise_diff": np.max(diff, axis=1),
},
)
assert 0
# Simple test to aid debugging order of operations
# Minimize the case to an SLS that adds two rows
@given(seed=st.integers(0, 65535))
@settings(deadline=datetime.timedelta(seconds=10))
def test_small_sls(self, seed):
np.random.seed(seed)
workspace.ResetWorkspace()
n = 2
DIM = 3
data = 4 * (np.random.random_sample((n, DIM)) + 1).astype(np.float32)
lengths = np.array([n], dtype=np.int32)
indices = np.array(range(n), dtype=np.int64)
weights = np.random.uniform(low=0.01, high=0.5, size=[n]).astype(np.float32)
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"]
)
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused8BitRowwise",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
ref_net = caffe2_pb2.NetDef()
ref_net.name = "ref"
ref_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"]
)
ref_net.external_output.append("Y")
ref_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
workspace.FeedBlob("data", data)
workspace.RunOperatorOnce(
core.CreateOperator(
"FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"]
)
)
quantized_data = workspace.FetchBlob("quantized_data")
onnxified_net = onnxifi_caffe2_net(
pred_net,
{},
max_batch_size=1,
max_seq_size=n,
debug=True,
adjust_batch=True,
use_onnx=False,
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in onnxified_net.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("indices", indices)
workspace.FeedBlob("lengths", lengths)
workspace.FeedBlob("weights", weights)
workspace.CreateNet(onnxified_net)
workspace.CreateNet(ref_net)
workspace.RunNet(onnxified_net.name)
Y_glow = workspace.FetchBlob("Y")
workspace.RunNet(ref_net.name)
Y_ref = workspace.FetchBlob("Y")
diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
max_err = np.max(diff, axis=1)
num_offenders = (max_err > 0).sum()
if num_offenders > 0:
np.set_printoptions(precision=12)
print(
"ref",
Y_ref.astype(np.float16).astype(np.float32),
"glow",
Y_glow.astype(np.float16).astype(np.float32),
)
print_test_debug_info(
"slws_fused_8bit_rowwise_inv_scale",
{
"seed": seed,
"indices": indices,
"data": data,
"quantized_data": quantized_data,
"lengths": lengths,
"weights": weights,
"Y_glow": Y_glow,
"Y_ref": Y_ref,
"diff": diff,
"rowwise_diff": np.max(diff, axis=1),
},
)
assert 0
@given(seed=st.integers(0, 65535))
@settings(deadline=datetime.timedelta(seconds=10))
def test_sls_layernorm(self, seed):
np.random.seed(seed)
workspace.ResetWorkspace()
n = 2
DIM = 3
data = 4 * (np.random.random_sample((n, DIM)) + 1).astype(np.float32)
lengths = np.array([n], dtype=np.int32)
indices = np.array(range(n), dtype=np.int64)
weights = np.random.uniform(low=0.01, high=0.5, size=[n]).astype(np.float32)
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"]
)
pred_net.external_output.append("Y_norm")
pred_net.external_output.append("Y_mean")
pred_net.external_output.append("Y_std")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused8BitRowwise",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
pred_net.op.add().CopyFrom(
core.CreateOperator(
"LayerNorm",
["Y"],
["Y_norm", "Y_mean", "Y_std"],
epsilon=1e-4,
)
)
ref_net = caffe2_pb2.NetDef()
ref_net.name = "ref"
ref_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"]
)
ref_net.external_output.append("Y_norm")
ref_net.external_output.append("Y_mean")
ref_net.external_output.append("Y_std")
ref_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
ref_net.op.add().CopyFrom(
core.CreateOperator(
"LayerNormFakeFP16NNPI",
["Y"],
["Y_norm", "Y_mean", "Y_std"],
epsilon=1e-4,
axis=1,
elementwise_affine=False
)
)
workspace.FeedBlob("data", data)
workspace.RunOperatorOnce(
core.CreateOperator(
"FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"]
)
)
quantized_data = workspace.FetchBlob("quantized_data")
onnxified_net = onnxifi_caffe2_net(
pred_net,
{},
max_batch_size=1,
max_seq_size=n,
debug=True,
adjust_batch=True,
use_onnx=False,
)
print("before", pred_net)
print("after", onnxified_net)
workspace.FeedBlob("indices", indices)
workspace.FeedBlob("lengths", lengths)
workspace.FeedBlob("weights", weights)
workspace.CreateNet(onnxified_net)
workspace.CreateNet(ref_net)
workspace.RunNet(onnxified_net.name)
Y_glow = workspace.FetchBlob("Y_norm")
Y_mean_glow = workspace.FetchBlob("Y_mean")
Y_std_glow = workspace.FetchBlob("Y_std")
workspace.RunNet(ref_net.name)
Y = workspace.FetchBlob("Y")
print("pre normalization", Y)
Y_ref = workspace.FetchBlob("Y_norm")
Y_mean_ref = workspace.FetchBlob("Y_mean")
Y_std_ref = workspace.FetchBlob("Y_std")
# print(Y_ref, Y_glow)
# print(Y_ref.shape, Y_glow.shape)
diff = np.abs(Y_ref - Y_glow)
max_err = np.max(diff, axis=1)
num_offenders = (max_err > 0).sum()
if num_offenders > 0:
np.set_printoptions(precision=12)
print(
"ref",
Y_ref.astype(np.float16).astype(np.float32),
"glow",
Y_glow.astype(np.float16).astype(np.float32),
)
print_test_debug_info(
"slws_fused_8bit_rowwise_inv_scale",
{
"seed": seed,
"indices": indices,
"data": data,
"quantized_data": quantized_data,
"lengths": lengths,
"weights": weights,
"Y_norm_glow": Y_glow,
"Y_norm_ref": Y_ref,
"Y_mean_glow": Y_mean_glow,
"Y_std_glow": Y_std_glow,
"Y_mean_ref": Y_mean_ref,
"Y_std_ref": Y_std_ref,
"diff": diff,
"rowwise_diff": np.max(diff, axis=1),
},
)
assert 0
if __name__ == '__main__':
unittest.main()

View File

@ -1,264 +0,0 @@
import unittest
# Must happen before importing caffe2.python.*
import caffe2.python.fakelowp.init_shared_libs # noqa
import datetime
import numpy as np
from hypothesis import given, settings
from hypothesis import strategies as st
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace
from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
from caffe2.python.fakelowp.test_utils import print_test_debug_info
import caffe2.python.serialized_test.serialized_test_util as serial
workspace.GlobalInit(
[
"caffe2",
"--glow_global_fp16=0",
"--glow_global_fused_scale_offset_fp16=0",
"--glow_global_force_sls_fp16_accum=0",
]
)
GLOW_MATMUL_ATOL = 1e-5
GLOW_MATMUL_RTOL = 1e-3
class SparseLengthsSum8BitFakeNNPIFp32Test(serial.SerializedTestCase):
@given(
seed=st.integers(0, 65535),
num_rows=st.integers(2, 20),
embedding_dim=st.sampled_from([8, 12, 16, 24, 32, 54, 64, 128]),
batch_size=st.integers(1, 5),
max_weight=st.integers(0, 100),
)
@settings(deadline=datetime.timedelta(seconds=10))
def test_slws_fused_8bit_rowwise_acc32_nnpi(
self, seed, num_rows, embedding_dim, batch_size, max_weight
):
workspace.GlobalInit(
[
"caffe2",
"--glow_global_fp16=0",
"--glow_global_fused_scale_offset_fp16=0",
"--glow_global_force_sls_fp16_accum=0",
]
)
workspace.ResetWorkspace()
np.random.seed(seed)
data = np.random.rand(num_rows, embedding_dim).astype(np.float32)
lengths = np.random.choice(np.arange(1, num_rows), batch_size).astype(np.int32)
_indices = []
for length in lengths:
_indices.extend(np.random.choice(np.arange(1, num_rows), length))
indices = np.asarray(_indices).astype(np.int64)
weights = np.random.uniform(
low=0,
high=max_weight,
size=[len(indices)]
).astype(np.float32)
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"]
)
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused8BitRowwise",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
ref_net = caffe2_pb2.NetDef()
ref_net.name = "ref"
ref_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"]
)
ref_net.external_output.append("Y")
ref_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
workspace.FeedBlob("data", data)
workspace.RunOperatorOnce(
core.CreateOperator(
"FloatToFused8BitRowwiseQuantized",
["data"],
["quantized_data"]
)
)
onnxified_net = onnxifi_caffe2_net(
pred_net,
{},
max_batch_size=batch_size,
max_seq_size=np.max(lengths),
debug=True,
adjust_batch=True,
use_onnx=False,
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in onnxified_net.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("indices", indices)
workspace.FeedBlob("lengths", lengths)
workspace.FeedBlob("weights", weights)
workspace.CreateNet(onnxified_net)
workspace.CreateNet(ref_net)
workspace.RunNet(onnxified_net.name)
Y_glow = workspace.FetchBlob("Y")
workspace.RunNet(ref_net.name)
Y_ref = workspace.FetchBlob("Y")
diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
max_err = np.max(diff, axis=1)
num_offenders = (max_err > 0).sum()
if num_offenders > 0:
print_test_debug_info(
"test_slws_fused_8bit_rowwise_acc32_nnpi",
{
"seed": seed,
"num_rows": num_rows,
"embedding_dim": embedding_dim,
"batch_size": batch_size,
"indices": indices,
"data": data.shape,
"lengths": lengths,
"weights": weights,
"Y_glow": Y_glow,
"Y_ref": Y_ref,
"diff": diff,
"rowwise_diff": np.max(diff, axis=1),
},
)
assert 0
@given(seed=st.integers(0, 65535))
@settings(deadline=datetime.timedelta(seconds=10))
def test_small_sls_acc32(self, seed):
workspace.GlobalInit(
[
"caffe2",
"--glow_global_fp16=0",
"--glow_global_fused_scale_offset_fp16=0",
"--glow_global_force_sls_fp16_accum=0",
]
)
np.random.seed(seed)
workspace.ResetWorkspace()
n = 2
DIM = 3
data = 4 * (np.random.random_sample((n, DIM)) + 1).astype(np.float32)
lengths = np.array([n], dtype=np.int32)
indices = np.array(range(n), dtype=np.int64)
weights = np.random.uniform(low=0.01, high=0.5, size=[n]).astype(np.float32)
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"]
)
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused8BitRowwise",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
ref_net = caffe2_pb2.NetDef()
ref_net.name = "ref"
ref_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"]
)
ref_net.external_output.append("Y")
ref_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
workspace.FeedBlob("data", data)
workspace.RunOperatorOnce(
core.CreateOperator(
"FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"]
)
)
quantized_data = workspace.FetchBlob("quantized_data")
onnxified_net = onnxifi_caffe2_net(
pred_net,
{},
max_batch_size=1,
max_seq_size=n,
debug=True,
adjust_batch=True,
use_onnx=False,
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in onnxified_net.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("indices", indices)
workspace.FeedBlob("lengths", lengths)
workspace.FeedBlob("weights", weights)
workspace.CreateNet(onnxified_net)
workspace.CreateNet(ref_net)
workspace.RunNet(onnxified_net.name)
Y_glow = workspace.FetchBlob("Y")
workspace.RunNet(ref_net.name)
Y_ref = workspace.FetchBlob("Y")
diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
max_err = np.max(diff, axis=1)
num_offenders = (max_err > 0).sum()
if num_offenders > 0:
np.set_printoptions(precision=12)
print(
"ref",
Y_ref.astype(np.float16).astype(np.float32),
"glow",
Y_glow.astype(np.float16).astype(np.float32),
)
print_test_debug_info(
"test_small_sls_acc32",
{
"seed": seed,
"indices": indices,
"data": data,
"quantized_data": quantized_data,
"lengths": lengths,
"weights": weights,
"Y_glow": Y_glow,
"Y_ref": Y_ref,
"diff": diff,
"rowwise_diff": np.max(diff, axis=1),
},
)
assert 0
if __name__ == '__main__':
unittest.main()

File diff suppressed because it is too large

View File

@ -1,74 +0,0 @@
#pragma once
#include <vector>
#include <fbgemm/FbgemmConvert.h>
#include "caffe2/operators/elementwise_ops.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
namespace caffe2 {
using namespace std;
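// Each functor below emulates reduced-precision NNPI behavior: inputs are
// rounded to fp16 via fbgemm::RoundToFloat16, the arithmetic itself runs in
// fp32, and most functors round the result back to fp16 as well.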
template <class Context>
struct ReluFakeFp16Functor {
template <typename T>
bool operator()(const int N, const T* X, T* Y, Context* /* unused */) const {
std::vector<float> X_fp16(N);
fbgemm::RoundToFloat16(
X, X_fp16.data(), N, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
EigenVectorMap<T>(Y, N) =
ConstEigenVectorMap<float>(X_fp16.data(), N).cwiseMax(T(0));
return true;
}
};
template <class Context>
struct SqrFakeFp16Functor {
template <typename T>
bool operator()(const int N, const T* X, T* Y, Context* context) const {
std::vector<float> X_fp16(N);
fbgemm::RoundToFloat16(
X, X_fp16.data(), N, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
math::Sqr(N, X_fp16.data(), Y, context);
fbgemm::RoundToFloat16(Y, Y, N, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
return true;
}
};
struct SigmoidFakeIdealFp16Functor {
template <typename T>
bool operator()(const int N, const T* X, T* Y, CPUContext* /* unused */)
const {
std::vector<float> X_fp16(N);
fbgemm::RoundToFloat16(X, X_fp16.data(), N);
EigenVectorArrayMap<T>(Y, N) =
T(1) / (T(1) + (-ConstEigenVectorArrayMap<T>(X_fp16.data(), N)).exp());
fbgemm::RoundToFloat16(Y, Y, N, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
return true;
}
};
struct TanhFakeIdealFp16Functor {
template <typename T>
bool operator()(const int N, const T* X, T* Y, CPUContext* context) const {
std::vector<float> X_fp16(N);
fbgemm::RoundToFloat16(
X, X_fp16.data(), N, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
math::Tanh<T, CPUContext>(N, X_fp16.data(), Y, context);
fbgemm::RoundToFloat16(Y, Y, N, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
return true;
}
};
} // namespace caffe2
namespace fake_fp16 {
at::Half CalcSigmoidByLUT(at::Half x);
at::Half CalcSwishByLUT(at::Half x);
at::Half CalcSwishByLUTCubic(at::Half x);
at::Half CalcTanhByLUT(at::Half input);
} // namespace fake_fp16

View File

@ -1,33 +0,0 @@
if(USE_GLOO)
set(Caffe2_CONTRIB_GLOO_CPU_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/allgather_ops.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/allreduce_ops.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/barrier_ops.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/broadcast_ops.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/common.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/common_world_ops.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/context.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/reduce_scatter_ops.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/store_handler.cc"
)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${Caffe2_CONTRIB_GLOO_CPU_SRC} PARENT_SCOPE)
if(USE_CUDA)
set(Caffe2_CONTRIB_GLOO_GPU_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/allreduce_ops_gpu.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/broadcast_ops_gpu.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/common_world_ops_gpu.cc"
)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${Caffe2_CONTRIB_GLOO_GPU_SRC} PARENT_SCOPE)
endif(USE_CUDA)
if(USE_ROCM)
set(Caffe2_CONTRIB_GLOO_HIP_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/hip/allreduce_ops_gpu.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/hip/broadcast_ops_gpu.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/hip/common_world_ops_gpu.cc"
)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${Caffe2_CONTRIB_GLOO_HIP_SRC} PARENT_SCOPE)
set(Caffe2_HIP_INCLUDE ${GLOO_HIP_INCLUDE} ${Caffe2_HIP_INCLUDE} PARENT_SCOPE)
endif(USE_ROCM)
endif()

View File

@ -1,64 +0,0 @@
/**
* Copyright (c) 2017-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "allgather_ops.h"
#include <gloo/allgather_ring.h>
namespace caffe2 {
namespace gloo {
template <class Context>
void AllgatherOp<Context>::initializeAlgorithm() {
if (init_.template IsType<float>()) {
algorithm_.reset(new ::gloo::AllgatherRing<float>(
init_.context,
init_.template getInputs<float>(),
init_.template getOutput<float>(),
init_.size));
} else if (init_.template IsType<long>()) {
algorithm_.reset(new ::gloo::AllgatherRing<long>(
init_.context,
init_.template getInputs<long>(),
init_.template getOutput<long>(),
init_.size));
} else if (init_.template IsType<int>()) {
algorithm_.reset(new ::gloo::AllgatherRing<int>(
init_.context,
init_.template getInputs<int>(),
init_.template getOutput<int>(),
init_.size));
} else if (init_.template IsType<at::Half>()) {
algorithm_.reset(new ::gloo::AllgatherRing<::gloo::float16>(
init_.context,
init_.template getInputs<::gloo::float16>(),
init_.template getOutput<::gloo::float16>(),
init_.size));
} else {
CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
}
}
// Used outside of the translation unit
template void AllgatherOp<CPUContext>::initializeAlgorithm();
namespace {
REGISTER_CPU_OPERATOR_WITH_ENGINE(Allgather, GLOO, AllgatherOp<CPUContext>);
} // namespace
} // namespace gloo
} // namespace caffe2

View File

@ -1,130 +0,0 @@
/**
* Copyright (c) 2017-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <algorithm>
#include "caffe2/contrib/gloo/common.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/types.h"
#include <gloo/algorithm.h>
#include <gloo/common/error.h>
#include <gloo/context.h>
namespace caffe2 {
namespace gloo {
template <class Context>
class AllgatherOp final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
AllgatherOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
ws_(ws),
status_blob_(
OperatorBase::GetSingleArgument<std::string>("status_blob", "")) {
if (status_blob_ != "") {
ws_->CreateBlob(status_blob_);
}
}
~AllgatherOp() override {}
bool RunOnDevice() override {
std::call_once(once_, [&] { initialize(); });
// If any parameter has changed in between runs, the initialized
// algorithm is invalid and cannot be used.
update(current_);
CAFFE_ENFORCE(current_ == init_, "Inputs/outputs have changed");
try {
algorithm_->run();
} catch (::gloo::IoException& ioe) {
LOG(ERROR) << "Caught gloo IO exception: " << ioe.what();
if (status_blob_ != "") {
signalFailure(ws_->GetBlob(status_blob_), ioe);
return false;
} else {
throw;
}
}
return true;
}
protected:
void initialize() {
// Allocate output tensor
CAFFE_ENFORCE_EQ(OutputSize(), 1);
auto comm_size =
OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0)->size;
const auto dims = std::vector<int64_t>(
1, (InputSize() - 1) * Input(1).numel() * comm_size);
Output(0)->Resize(dims);
// Store which inputs/outputs this instance initialized with
update(init_);
CAFFE_ENFORCE_EQ(init_.outputs.size(), 1);
// Verify tensors all have same size
size_t size = Input(1).numel();
for (const auto i : c10::irange(2, InputSize())) {
CAFFE_ENFORCE_EQ(Input(i).numel(), size);
}
// Verify tensors all have same type
TypeMeta meta = Input(1).dtype();
for (const auto i : c10::irange(2, InputSize())) {
CAFFE_ENFORCE(Input(i).dtype() == meta);
}
// Finally initialize the algorithm
initializeAlgorithm();
}
void initializeAlgorithm();
std::once_flag once_;
std::unique_ptr<::gloo::Algorithm> algorithm_;
// Captures the parameters passed to Gloo when first initialized.
// An instance is updated every time this op runs and is compared
// to the reference instance for equality. If any parameter has
// changed from run to run, the initialized algorithm is invalid.
void update(GlooParameters& params) {
params.context = OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0);
params.inputs.resize(InputSize() - 1);
params.size = Input(1).numel();
params.meta = Input(1).dtype();
for (const auto i : c10::irange(params.inputs.size())) {
params.inputs[i] = Input(i + 1).raw_data();
}
params.outputs.resize(OutputSize());
params.outputs[0] = Output(0)->raw_mutable_data(params.meta);
}
GlooParameters init_;
GlooParameters current_;
Workspace* ws_;
std::string status_blob_;
};
} // namespace gloo
} // namespace caffe2

View File

@ -1,123 +0,0 @@
#include "allreduce_ops.h"
#include <math.h>
#include <gloo/allreduce_bcube.h>
#include <gloo/allreduce_halving_doubling.h>
#include <gloo/allreduce_ring.h>
#include <gloo/allreduce_ring_chunked.h>
#include <gloo/types.h>
namespace {
/**
* This is a helper function which attempts to pick a base value depending on
* the # of nodes. The larger the base, the better the performance (up to 4)
* we have observed in gloo benchmarks. At the moment bcube only works if
* # nodes = base ^ x for some constant x, so if the # of nodes doesn't match
* this expectation we simply return -1, which tells the caller to switch to
* another algorithm such as halving-doubling.
*/
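// For example: nodes = 4 or 16 yields base 4 (an exact power); nodes = 8 also
// yields 4 (a multiple of 4 below 4^2); nodes = 12 yields 6; nodes = 7 yields
// -1, which makes the caller fall back to halving-doubling.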
static int getAllrduceBcubeBase(int nodes) {
auto getExponent = [](int n, int b) -> int {
float lg2n = log2(n);
float lg2b = log2(b);
return ceil(lg2n / lg2b);
};
auto baseCheck = [&](int n, int b) -> bool {
int e = getExponent(n, b);
return n == pow(b, e);
};
for (const auto base : {6, 5, 4, 3, 2}) {
if (baseCheck(nodes, base)) {
return base;
}
/*
* Base could work if # nodes is multiple of the base yet smaller than
* base^2
*/
if (nodes < base * base && 0 == nodes % base) {
return base;
}
}
return -1;
}
} // namespace
namespace caffe2 {
namespace gloo {
template <class Context>
void AllreduceOp<Context>::initializeBcube() {
int base = getAllrduceBcubeBase(init_.size);
if (-1 == base) {
return initializeHalvingDoubling();
}
init_.context->base = base;
if (init_.template IsType<float>()) {
algorithm_.reset(new ::gloo::AllreduceBcube<float>(
init_.context, init_.template getOutputs<float>(), init_.size));
} else if (init_.template IsType<::at::Half>()) {
algorithm_.reset(new ::gloo::AllreduceBcube<::gloo::float16>(
init_.context,
init_.template getOutputs<::gloo::float16>(),
init_.size));
} else {
CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
}
}
template <class Context>
void AllreduceOp<Context>::initializeHalvingDoubling() {
if (init_.template IsType<float>()) {
algorithm_.reset(new ::gloo::AllreduceHalvingDoubling<float>(
init_.context, init_.template getOutputs<float>(), init_.size));
} else if (init_.template IsType<::at::Half>()) {
algorithm_.reset(new ::gloo::AllreduceHalvingDoubling<::gloo::float16>(
init_.context,
init_.template getOutputs<::gloo::float16>(),
init_.size));
} else {
CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
}
}
// Used outside of the translation unit
template void AllreduceOp<CPUContext>::initializeHalvingDoubling();
template <class Context>
void AllreduceOp<Context>::initializeRingFull() {
if (init_.template IsType<float>()) {
algorithm_.reset(new ::gloo::AllreduceRing<float>(
init_.context, init_.template getOutputs<float>(), init_.size));
} else if (init_.template IsType<::at::Half>()) {
algorithm_.reset(new ::gloo::AllreduceRing<::gloo::float16>(
init_.context,
init_.template getOutputs<::gloo::float16>(),
init_.size));
} else {
CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
}
}
template <class Context>
void AllreduceOp<Context>::initializeRingChunked() {
if (init_.template IsType<float>()) {
algorithm_.reset(new ::gloo::AllreduceRingChunked<float>(
init_.context, init_.template getOutputs<float>(), init_.size));
} else if (init_.template IsType<::at::Half>()) {
algorithm_.reset(new ::gloo::AllreduceRingChunked<::gloo::float16>(
init_.context,
init_.template getOutputs<::gloo::float16>(),
init_.size));
} else {
CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
}
}
namespace {
REGISTER_CPU_OPERATOR_WITH_ENGINE(Allreduce, GLOO, AllreduceOp<CPUContext>);
} // namespace
} // namespace gloo
} // namespace caffe2

View File

@ -1,134 +0,0 @@
#pragma once
#include <algorithm>
#include "caffe2/contrib/gloo/common.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
#include <gloo/algorithm.h>
#include <gloo/common/error.h>
#include <gloo/context.h>
namespace caffe2 {
namespace gloo {
template <class Context>
class AllreduceOp final : public Operator<Context> {
enum Mode { RING_FULL, RING_CHUNKED, HALVING_DOUBLING, BCUBE };
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
AllreduceOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
ws_(ws),
status_blob_(
OperatorBase::GetSingleArgument<std::string>("status_blob", "")),
gpu_direct_(
OperatorBase::GetSingleArgument<bool>("gpu_direct", false)) {
if (status_blob_ != "") {
ws_->CreateBlob(status_blob_);
}
}
~AllreduceOp() override {}
bool RunOnDevice() override {
std::call_once(once_, [&] { initialize(); });
// If any parameter has changed in between runs, the initialized
// algorithm is invalid and cannot be used.
update(current_);
CAFFE_ENFORCE(current_ == init_, "Inputs/outputs have changed");
try {
algorithm_->run();
} catch (::gloo::IoException& ioe) {
LOG(ERROR) << "Caught gloo IO exception: " << ioe.what();
if (status_blob_ != "") {
signalFailure(ws_->GetBlob(status_blob_), ioe);
return false;
} else {
throw;
}
}
return true;
}
protected:
void initialize() {
Mode mode = HALVING_DOUBLING;
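// The algorithm is currently hard-coded to halving-doubling; the other modes
// in the enum are only reachable by changing this default.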
// Store which inputs/outputs this instance initialized with
update(init_);
// Verify inputs == outputs
CAFFE_ENFORCE_EQ(init_.inputs.size(), init_.outputs.size());
for (const auto i : c10::irange(0U, init_.inputs.size())) {
CAFFE_ENFORCE_EQ(init_.inputs[i], init_.outputs[i]);
}
// Verify tensors all have same size
auto size = Input(1).numel();
for (const auto i : c10::irange(2, InputSize())) {
CAFFE_ENFORCE_EQ(Input(i).numel(), size);
}
// Verify tensors all have same type
TypeMeta meta = Input(1).dtype();
for (const auto i : c10::irange(2, InputSize())) {
CAFFE_ENFORCE(Input(i).dtype() == meta);
}
switch (mode) {
case RING_FULL:
initializeRingFull();
return;
case RING_CHUNKED:
initializeRingChunked();
return;
case HALVING_DOUBLING:
initializeHalvingDoubling();
return;
case BCUBE:
initializeBcube();
return;
}
CAFFE_ENFORCE(false, "Unreachable code");
}
void initializeBcube();
void initializeHalvingDoubling();
void initializeRingFull();
void initializeRingChunked();
std::once_flag once_;
std::unique_ptr<::gloo::Algorithm> algorithm_;
// Captures the parameters passed to Gloo when first initialized.
// An instance is updated every time this op runs and is compared
// to the reference instance for equality. If any parameter has
// changed from run to run, the initialized algorithm is invalid.
void update(GlooParameters& params) {
params.context = OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0);
params.inputs.resize(InputSize() - 1);
params.outputs.resize(OutputSize());
for (const auto i : c10::irange(0U, params.inputs.size())) {
params.inputs[i] = Input(i + 1).raw_data();
params.outputs[i] = Output(i)->raw_mutable_data();
}
params.size = Output(0)->numel();
params.meta = Output(0)->dtype();
}
GlooParameters init_;
GlooParameters current_;
Workspace* ws_;
std::string status_blob_;
const bool gpu_direct_;
};
} // namespace gloo
} // namespace caffe2

View File

@ -1,168 +0,0 @@
#include "caffe2/contrib/gloo/allreduce_ops.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/logging.h"
#include <gloo/cuda_allreduce_bcube.h>
#include <gloo/cuda_allreduce_halving_doubling.h>
#include <gloo/cuda_allreduce_ring.h>
#include <gloo/cuda_allreduce_ring_chunked.h>
#include <gloo/types.h>
namespace caffe2 {
namespace gloo {
namespace {
// Decides on using GPUDirect based on device support.
template <template <typename T, typename W> class A, typename T>
std::unique_ptr<::gloo::Algorithm> initializeAlgorithm(
bool gpu_direct_,
std::shared_ptr<::gloo::Context> context,
std::vector<T*> ptrs,
size_t size) {
if (gpu_direct_) {
if (context->getDevice()->hasGPUDirect()) {
return std::unique_ptr<::gloo::Algorithm>(
new A<T, ::gloo::CudaDeviceWorkspace<T>>(context, ptrs, size));
} else {
LOG(WARNING)
<< "GPUDirect not available; "
<< "Gloo communication will go through system memory instead.";
}
}
return std::unique_ptr<::gloo::Algorithm>(
new A<T, ::gloo::CudaHostWorkspace<T>>(context, ptrs, size));
}
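// Note: with gpu_direct_ set and a transport that supports GPUDirect, the
// algorithm operates directly on device memory (CudaDeviceWorkspace);
// otherwise it stages through host memory (CudaHostWorkspace), as the warning
// above indicates.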
/**
* This is a helper function which attempts to pick a base value depending on
* the # of nodes. The larger the base, the better the performance (up to 4)
* we have observed in gloo benchmarks. At the moment bcube only works if
* # nodes = base ^ x for some constant x, so if the # of nodes doesn't match
* this expectation we simply return -1, which tells the caller to switch to
* another algorithm such as halving-doubling.
*/
static int getAllrduceBcubeBase(int nodes) {
auto getExponent = [](int n, int b) -> int {
float lg2n = log2(n);
float lg2b = log2(b);
return ceil(lg2n / lg2b);
};
auto baseCheck = [&](int n, int b) -> bool {
int e = getExponent(n, b);
return n == pow(b, e);
};
for (const auto base : {6, 5, 4, 3, 2}) {
if (baseCheck(nodes, base)) {
return base;
}
/*
* Base could work if # nodes is multiple of the base yet smaller than
* base^2
*/
if (nodes < base * base && 0 == nodes % base) {
return base;
}
}
return -1;
}
} // namespace
template <class Context>
void AllreduceOp<Context>::initializeBcube() {
int base = getAllrduceBcubeBase(init_.size);
if (-1 == base) {
return initializeHalvingDoubling();
}
init_.context->base = base;
if (init_.template IsType<float>()) {
algorithm_ = initializeAlgorithm<::gloo::CudaAllreduceBcube, float>(
gpu_direct_,
init_.context,
init_.template getOutputs<float>(),
init_.size);
} else if (init_.template IsType<at::Half>()) {
algorithm_ =
initializeAlgorithm<::gloo::CudaAllreduceBcube, ::gloo::float16>(
gpu_direct_,
init_.context,
init_.template getOutputs<::gloo::float16>(),
init_.size);
} else {
CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
}
}
template <class Context>
void AllreduceOp<Context>::initializeHalvingDoubling() {
if (init_.template IsType<float>()) {
algorithm_ =
initializeAlgorithm<::gloo::CudaAllreduceHalvingDoubling, float>(
gpu_direct_,
init_.context,
init_.template getOutputs<float>(),
init_.size);
} else if (init_.template IsType<at::Half>()) {
algorithm_ =
initializeAlgorithm<::gloo::CudaAllreduceHalvingDoubling, ::gloo::float16>(
gpu_direct_,
init_.context,
init_.template getOutputs<::gloo::float16>(),
init_.size);
} else {
CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
}
}
template <class Context>
void AllreduceOp<Context>::initializeRingFull() {
if (init_.template IsType<float>()) {
algorithm_ =
initializeAlgorithm<::gloo::CudaAllreduceRing, float>(
gpu_direct_,
init_.context,
init_.template getOutputs<float>(),
init_.size);
} else if (init_.template IsType<at::Half>()) {
algorithm_ =
initializeAlgorithm<::gloo::CudaAllreduceRing, ::gloo::float16>(
gpu_direct_,
init_.context,
init_.template getOutputs<::gloo::float16>(),
init_.size);
} else {
CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
}
}
template <class Context>
void AllreduceOp<Context>::initializeRingChunked() {
if (init_.template IsType<float>()) {
algorithm_ =
initializeAlgorithm<::gloo::CudaAllreduceRingChunked, float>(
gpu_direct_,
init_.context,
init_.template getOutputs<float>(),
init_.size);
} else if (init_.template IsType<at::Half>()) {
algorithm_ =
initializeAlgorithm<::gloo::CudaAllreduceRingChunked, ::gloo::float16>(
gpu_direct_,
init_.context,
init_.template getOutputs<::gloo::float16>(),
init_.size);
} else {
CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
}
}
namespace {
REGISTER_CUDA_OPERATOR_WITH_ENGINE(Allreduce, GLOO, AllreduceOp<CUDAContext>);
} // namespace
} // namespace gloo
} // namespace caffe2

View File

@ -1,11 +0,0 @@
#include "barrier_ops.h"
namespace caffe2 {
namespace gloo {
namespace {
REGISTER_CPU_OPERATOR_WITH_ENGINE(Barrier, GLOO, BarrierOp<CPUContext>);
} // namespace
} // namespace gloo
} // namespace caffe2

View File

@ -1,63 +0,0 @@
#pragma once
#include "caffe2/contrib/gloo/common.h"
#include "caffe2/core/operator.h"
#include <gloo/algorithm.h>
#include <gloo/barrier_all_to_one.h>
#include <gloo/common/error.h>
#include <gloo/context.h>
namespace caffe2 {
namespace gloo {
template <class Context>
class BarrierOp final : public Operator<Context> {
public:
BarrierOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
ws_(ws),
status_blob_(
OperatorBase::GetSingleArgument<std::string>("status_blob", "")) {
if (status_blob_ != "") {
ws_->CreateBlob(status_blob_);
}
}
~BarrierOp() override {}
bool RunOnDevice() override {
auto context = OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0);
std::call_once(once_, [&] {
initContext_ = context;
// Use an all-to-one barrier synchronizing against rank 0
algorithm_.reset(new ::gloo::BarrierAllToOne(initContext_, 0));
});
// If any parameter has changed in between runs, the initialized
// algorithm is invalid and cannot be used.
CAFFE_ENFORCE(context == initContext_, "Context has changed");
try {
algorithm_->run();
} catch (::gloo::IoException& ioe) {
LOG(ERROR) << "Caught gloo IO exception: " << ioe.what();
if (status_blob_ != "") {
signalFailure(ws_->GetBlob(status_blob_), ioe);
return false;
} else {
throw;
}
}
return true;
}
protected:
std::once_flag once_;
std::shared_ptr<::gloo::Context> initContext_;
std::unique_ptr<::gloo::Algorithm> algorithm_;
Workspace* ws_;
std::string status_blob_;
};
} // namespace gloo
} // namespace caffe2

View File

@ -1,36 +0,0 @@
#include "broadcast_ops.h"
#include <gloo/broadcast_one_to_all.h>
namespace caffe2 {
namespace gloo {
template <class Context>
void BroadcastOp<Context>::initializeAlgorithm() {
if (init_.template IsType<float>()) {
algorithm_.reset(new ::gloo::BroadcastOneToAll<float>(
init_.context, init_.template getOutputs<float>(), init_.size, root_));
} else if (init_.template IsType<long>()) {
algorithm_.reset(new ::gloo::BroadcastOneToAll<long>(
init_.context, init_.template getOutputs<long>(), init_.size, root_));
} else if (init_.template IsType<int>()) {
algorithm_.reset(new ::gloo::BroadcastOneToAll<int>(
init_.context, init_.template getOutputs<int>(), init_.size, root_));
} else if (init_.template IsType<at::Half>()) {
algorithm_.reset(new ::gloo::BroadcastOneToAll<::gloo::float16>(
init_.context,
init_.template getOutputs<::gloo::float16>(),
init_.size,
root_));
} else {
CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
}
}
namespace {
REGISTER_CPU_OPERATOR_WITH_ENGINE(Broadcast, GLOO, BroadcastOp<CPUContext>);
} // namespace
} // namespace gloo
} // namespace caffe2

View File

@ -1,112 +0,0 @@
#pragma once
#include <algorithm>
#include "caffe2/contrib/gloo/common.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/types.h"
#include <gloo/algorithm.h>
#include <gloo/common/error.h>
#include <gloo/context.h>
namespace caffe2 {
namespace gloo {
template <class Context>
class BroadcastOp final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
BroadcastOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
root_(OperatorBase::template GetSingleArgument<int>("root", 0)),
ws_(ws),
status_blob_(
OperatorBase::GetSingleArgument<std::string>("status_blob", "")) {
if (status_blob_ != "") {
ws_->CreateBlob(status_blob_);
}
}
~BroadcastOp() override {}
bool RunOnDevice() override {
std::call_once(once_, [&] { initialize(); });
// If any parameter has changed in between runs, the initialized
// algorithm is invalid and cannot be used.
update(current_);
CAFFE_ENFORCE(current_ == init_, "Inputs/outputs have changed");
try {
algorithm_->run();
} catch (::gloo::IoException& ioe) {
LOG(ERROR) << "Caught gloo IO exception: " << ioe.what();
if (status_blob_ != "") {
signalFailure(ws_->GetBlob(status_blob_), ioe);
return false;
} else {
throw;
}
}
return true;
}
protected:
void initialize() {
// Store which inputs/outputs this instance initialized with
update(init_);
// Verify inputs == outputs
CAFFE_ENFORCE_EQ(init_.inputs.size(), init_.outputs.size());
for (const auto i : c10::irange(init_.inputs.size())) {
CAFFE_ENFORCE_EQ(init_.inputs[i], init_.outputs[i]);
}
// Verify tensors all have same size
size_t size = Input(1).numel();
for (const auto i : c10::irange(2, InputSize())) {
CAFFE_ENFORCE_EQ(Input(i).numel(), size);
}
    // Verify tensors all have same type
TypeMeta meta = Input(1).dtype();
for (const auto i : c10::irange(2, InputSize())) {
CAFFE_ENFORCE(Input(i).dtype() == meta);
}
// Finally initialize the algorithm
initializeAlgorithm();
}
void initializeAlgorithm();
const int root_;
std::once_flag once_;
std::unique_ptr<::gloo::Algorithm> algorithm_;
// Captures the parameters passed to Gloo when first initialized.
// An instance is updated every time this op runs and is compared
// to the reference instance for equality. If any parameter has
// changed from run to run, the initialized algorithm is invalid.
void update(GlooParameters& params) {
params.context = OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0);
params.inputs.resize(InputSize() - 1);
params.outputs.resize(OutputSize());
for (const auto i : c10::irange(params.inputs.size())) {
params.inputs[i] = Input(i + 1).raw_data();
params.outputs[i] = Output(i)->raw_mutable_data();
}
params.size = Output(0)->numel();
params.meta = Output(0)->dtype();
}
GlooParameters init_;
GlooParameters current_;
Workspace* ws_;
std::string status_blob_;
};
} // namespace gloo
} // namespace caffe2
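
BroadcastOp, like the allreduce and reduce-scatter ops elsewhere in this directory, builds its Gloo algorithm once under std::call_once and then demands that the context, buffers and sizes it captured never change between runs. A self-contained sketch of that once-initialized, snapshot-compare idiom, with illustrative names (Snapshot, LazilyInitializedOp) that do not exist in Caffe2:

#include <cstddef>
#include <mutex>
#include <stdexcept>
#include <utility>
#include <vector>

// Illustrative stand-in for GlooParameters: whatever the algorithm was built from.
struct Snapshot {
  std::vector<void*> buffers;
  size_t size = 0;
  bool operator==(const Snapshot& other) const {
    return buffers == other.buffers && size == other.size;
  }
};

class LazilyInitializedOp {
 public:
  bool run(std::vector<void*> buffers, size_t size) {
    Snapshot current;
    current.buffers = std::move(buffers);
    current.size = size;
    std::call_once(once_, [&] {
      init_ = current;  // the real op would build its Gloo algorithm here
    });
    // Re-running with different buffers would invalidate the cached algorithm.
    if (!(current == init_)) {
      throw std::runtime_error("Inputs/outputs have changed");
    }
    // algorithm->run() would go here.
    return true;
  }

 private:
  std::once_flag once_;
  Snapshot init_;
};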

View File

@ -1,38 +0,0 @@
#include "caffe2/contrib/gloo/broadcast_ops.h"
#include "caffe2/core/context_gpu.h"
#include <gloo/cuda_broadcast_one_to_all.h>
namespace caffe2 {
namespace gloo {
template <class Context>
void BroadcastOp<Context>::initializeAlgorithm() {
if (init_.template IsType<float>()) {
algorithm_.reset(new ::gloo::CudaBroadcastOneToAll<float>(
init_.context, init_.template getOutputs<float>(), init_.size, root_));
} else if (init_.template IsType<long>()) {
algorithm_.reset(new ::gloo::CudaBroadcastOneToAll<long>(
init_.context, init_.template getOutputs<long>(), init_.size, root_));
} else if (init_.template IsType<int>()) {
algorithm_.reset(new ::gloo::CudaBroadcastOneToAll<int>(
init_.context, init_.template getOutputs<int>(), init_.size, root_));
} else if (init_.template IsType<at::Half>()) {
algorithm_.reset(new ::gloo::CudaBroadcastOneToAll<::gloo::float16>(
init_.context,
init_.template getOutputs<::gloo::float16>(),
init_.size,
root_));
} else {
CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
}
}
namespace {
REGISTER_CUDA_OPERATOR_WITH_ENGINE(Broadcast, GLOO, BroadcastOp<CUDAContext>);
} // namespace
} // namespace gloo
} // namespace caffe2

View File

@ -1,48 +0,0 @@
#include "caffe2/contrib/gloo/common.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/tensor.h"
#include <gloo/transport/tcp/device.h>
#if defined(GLOO_USE_IBVERBS) && GLOO_USE_IBVERBS
#include <gloo/transport/ibverbs/device.h>
#endif
namespace caffe2 {
namespace gloo {
void signalFailure(Blob* status_blob, std::exception& /* unused */) {
auto* res = BlobGetMutableTensor(status_blob, CPU);
res->Resize(1);
res->template mutable_data<int32_t>()[0] = 1;
}
std::shared_ptr<::gloo::transport::Device> createDevice(
const createDeviceAttr attr) {
if (attr.transport == "tcp") {
::gloo::transport::tcp::attr tcpAttr;
if (attr.interface.size() > 0) {
tcpAttr.iface = attr.interface;
}
return ::gloo::transport::tcp::CreateDevice(tcpAttr);
} else if (attr.transport == "ibverbs") {
#if defined(GLOO_USE_IBVERBS) && GLOO_USE_IBVERBS
::gloo::transport::ibverbs::attr ibverbsAttr;
ibverbsAttr.port = 1;
ibverbsAttr.index = 0;
if (attr.interface.size() > 0) {
ibverbsAttr.name = attr.interface;
}
return ::gloo::transport::ibverbs::CreateDevice(ibverbsAttr);
#else
CAFFE_THROW(
"Gloo was not compiled with ibverbs support. ",
"Please recompile with -DUSE_IBVERBS=1.");
#endif
}
CAFFE_THROW("Invalid transport: ", attr.transport);
}
} // namespace gloo
} // namespace caffe2
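
A minimal sketch of how a caller might use createDevice above, assuming the Caffe2 and Gloo headers are available; makeTcpDevice and the example interface name "eth0" are illustrative only:

#include <memory>
#include <string>

#include "caffe2/contrib/gloo/common.h"

#include <gloo/transport/device.h>

// Sketch only: pick the TCP transport and optionally pin it to an interface.
std::shared_ptr<::gloo::transport::Device> makeTcpDevice(
    const std::string& iface = "") {
  caffe2::gloo::createDeviceAttr attr;
  attr.transport = "tcp";
  attr.interface = iface;  // empty lets Gloo choose; e.g. "eth0" to force one
  return caffe2::gloo::createDevice(attr);
}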

View File

@ -1,73 +0,0 @@
#pragma once
#include <exception>
#include "caffe2/core/blob.h"
#include <gloo/config.h>
#include <gloo/context.h>
#include <gloo/transport/device.h>
namespace caffe2 {
namespace gloo {
TORCH_API void signalFailure(Blob* status_blob, std::exception& exception);
struct createDeviceAttr {
// "tcp" or "ibverbs"
std::string transport;
// E.g. "eth0" (tcp), or "mlx5_0" (ibverbs).
// This may be empty to make Gloo figure it out.
std::string interface;
};
TORCH_API std::shared_ptr<::gloo::transport::Device> createDevice(
const createDeviceAttr attr);
// Captures the parameters passed to Gloo.
struct GlooParameters {
std::shared_ptr<::gloo::Context> context;
std::vector<const void*> inputs;
std::vector<void*> outputs;
size_t size;
TypeMeta meta;
template <typename T>
std::vector<const T*> getInputs() {
std::vector<const T*> result;
result.reserve(inputs.size());
for (auto& input : inputs) {
result.push_back(reinterpret_cast<const T*>(input));
}
return result;
}
template <typename T>
std::vector<T*> getOutputs() {
std::vector<T*> result;
result.reserve(outputs.size());
for (auto& output : outputs) {
result.push_back(reinterpret_cast<T*>(output));
}
return result;
}
template <typename T>
T* getOutput() {
return reinterpret_cast<T*>(outputs[0]);
}
template <typename T>
bool IsType() const {
return meta.Match<T>();
}
bool operator==(GlooParameters const& other) const {
return context == other.context && inputs == other.inputs &&
outputs == other.outputs && size == other.size;
}
};
} // namespace gloo
} // namespace caffe2
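
GlooParameters is essentially a typed view over raw buffers. A hedged sketch of the accessors in isolation, assuming the Caffe2/Gloo headers are available; exampleGlooParameters and the stack buffers are made up for illustration:

#include <c10/util/typeid.h>

#include "caffe2/contrib/gloo/common.h"

// Sketch only: fill the struct by hand rather than from operator inputs.
void exampleGlooParameters() {
  float a[8] = {0};
  float b[8] = {0};

  caffe2::gloo::GlooParameters params;
  params.inputs = {a, b};    // stored as const void*
  params.outputs = {a, b};   // stored as void*
  params.size = 8;           // elements per buffer
  params.meta = c10::TypeMeta::Make<float>();

  if (params.IsType<float>()) {
    auto outs = params.getOutputs<float>();  // std::vector<float*> view
    outs[0][0] = 1.0f;
  }
}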

View File

@ -1,29 +0,0 @@
#include "caffe2/contrib/gloo/common_world_ops.h"
#include <gloo/transport/tcp/device.h>
namespace caffe2 {
namespace gloo {
template <>
void CreateCommonWorld<CPUContext>::initializeForContext() {
// Nothing to initialize for CPUContext.
}
namespace {
REGISTER_CPU_OPERATOR_WITH_ENGINE(
CreateCommonWorld,
GLOO,
CreateCommonWorld<CPUContext>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(
CloneCommonWorld,
GLOO,
CloneCommonWorld<CPUContext>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(DestroyCommonWorld, GLOO, DestroyCommonWorld);
} // namespace
} // namespace gloo
} // namespace caffe2

View File

@ -1,249 +0,0 @@
#pragma once
#include "caffe2/contrib/gloo/common.h"
#include "caffe2/contrib/gloo/store_handler.h"
#include "caffe2/core/operator.h"
#include "caffe2/distributed/store_handler.h"
#include <gloo/common/error.h>
#include <gloo/config.h>
#include <gloo/rendezvous/context.h>
#include <gloo/rendezvous/prefix_store.h>
#if defined(GLOO_USE_MPI) && GLOO_USE_MPI
#include <gloo/mpi/context.h>
#endif
namespace caffe2 {
namespace gloo {
template <class Context>
class CreateCommonWorld final : public Operator<Context> {
public:
using CommonWorld = std::shared_ptr<::gloo::Context>;
USE_OPERATOR_CONTEXT_FUNCTIONS;
CreateCommonWorld(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
size_(OperatorBase::template GetSingleArgument<int>("size", 0)),
rank_(OperatorBase::template GetSingleArgument<int>("rank", 0)),
sync_(OperatorBase::template GetSingleArgument<bool>("sync", false)),
transport_(OperatorBase::template GetSingleArgument<std::string>(
"transport", "tcp")),
interface_(OperatorBase::template GetSingleArgument<std::string>(
"interface", "")),
mpi_rendezvous_(OperatorBase::template GetSingleArgument<bool>(
"mpi_rendezvous", false)),
status_blob_(
OperatorBase::GetSingleArgument<std::string>("status_blob", "")),
timeout_ms_(OperatorBase::GetSingleArgument<int>("timeout_ms", -1)),
ws_(ws) {
CAFFE_ENFORCE(
operator_def.has_name(), "CreateCommonWorld operator requires name");
CAFFE_ENFORCE(rank_ >= 0 && rank_ < size_);
name_ = operator_def.name();
if (status_blob_ != "") {
ws_->CreateBlob(status_blob_);
}
initialize();
}
~CreateCommonWorld() override {
}
CommonWorld rendezvousWithMPI() {
#if defined(GLOO_USE_MPI) && GLOO_USE_MPI
auto context = ::gloo::mpi::Context::createManaged();
if (timeout_ms_ != -1) {
context->setTimeout(std::chrono::milliseconds(timeout_ms_));
}
context->connectFullMesh(device_);
return context;
#else
CAFFE_THROW(
"Gloo was not compiled with MPI support. ",
"Please recompile with -DUSE_MPI=1.");
#endif
}
CommonWorld rendezvousWithStore(
const std::unique_ptr<StoreHandler>& handler) {
// Use PrefixStore to isolate different CreateCommonWorld instances
StoreHandlerWrapper wrapper(*handler);
::gloo::rendezvous::PrefixStore store(name_, wrapper);
auto context = std::make_shared<::gloo::rendezvous::Context>(rank_, size_);
if (timeout_ms_ != -1) {
context->setTimeout(std::chrono::milliseconds(timeout_ms_));
}
context->connectFullMesh(store, device_);
return context;
}
bool RunOnDevice() override {
try {
CommonWorld context;
if (mpi_rendezvous_) {
context = rendezvousWithMPI();
} else {
CAFFE_ENFORCE_EQ(InputSize(), 1, "Expected store handler input");
const auto& handler =
OperatorBase::Input<std::unique_ptr<StoreHandler>>(STORE_HANDLER);
context = rendezvousWithStore(handler);
}
// Switch pairs to synchronous mode if configured to do so
if (sync_) {
for (int i = 0; i < context->size; i++) {
auto& pair = context->getPair(i);
if (pair) {
pair->setSync(true, false);
}
}
}
*OperatorBase::Output<CommonWorld>(COMM) = std::move(context);
} catch (::gloo::IoException& ioe) {
LOG(ERROR) << "Caught gloo IO exception: " << ioe.what();
return handleException(ioe);
} catch (::caffe2::StoreHandlerTimeoutException& te) {
LOG(ERROR) << "Caught store handler timeout exception: " << te.what();
return handleException(te);
}
return true;
}
private:
bool handleException(std::exception& ex) {
if (status_blob_ != "") {
signalFailure(ws_->GetBlob(status_blob_), ex);
return false;
} else {
throw;
}
}
void initialize() {
// Share single device between all common worlds.
static std::once_flag once;
static std::shared_ptr<::gloo::transport::Device> device;
std::call_once(once, [&]() {
createDeviceAttr attr;
attr.transport = transport_;
attr.interface = interface_;
device = createDevice(attr);
});
device_ = device;
// Context specific initialization.
initializeForContext();
}
void initializeForContext();
const int size_;
const int rank_;
const bool sync_;
const std::string transport_;
const std::string interface_;
const bool mpi_rendezvous_;
const std::string status_blob_;
const int timeout_ms_;
Workspace* ws_;
std::string name_;
std::shared_ptr<::gloo::transport::Device> device_;
INPUT_TAGS(STORE_HANDLER);
OUTPUT_TAGS(COMM);
};
template <class Context>
class CloneCommonWorld final : public Operator<Context> {
public:
using CommonWorld = std::shared_ptr<::gloo::Context>;
USE_OPERATOR_CONTEXT_FUNCTIONS;
CloneCommonWorld(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
sync_(OperatorBase::template GetSingleArgument<bool>("sync", false)),
ws_(ws),
status_blob_(
OperatorBase::GetSingleArgument<std::string>("status_blob", "")) {
if (status_blob_ != "") {
ws_->CreateBlob(status_blob_);
}
}
~CloneCommonWorld() override {}
bool RunOnDevice() override {
try {
auto existing = OperatorBase::Input<CommonWorld>(EXISTING_COMM);
::gloo::rendezvous::ContextFactory factory(existing);
auto clone = factory.makeContext(existing->getDevice());
// Switch pairs to synchronous mode if configured to do so
if (sync_) {
for (int i = 0; i < clone->size; i++) {
auto& pair = clone->getPair(i);
if (pair) {
pair->setSync(true, false);
}
}
}
*OperatorBase::Output<CommonWorld>(CLONED_COMM) = std::move(clone);
} catch (::gloo::IoException& ioe) {
LOG(ERROR) << "Caught gloo IO exception: " << ioe.what();
return handleException(ioe);
}
return true;
}
private:
bool handleException(std::exception& ex) {
if (status_blob_ != "") {
signalFailure(ws_->GetBlob(status_blob_), ex);
return false;
} else {
throw;
}
}
const bool sync_;
Workspace* ws_;
std::string status_blob_;
INPUT_TAGS(EXISTING_COMM);
OUTPUT_TAGS(CLONED_COMM);
};
class DestroyCommonWorld final : public Operator<CPUContext> {
public:
DestroyCommonWorld(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws) {
cw_name_ = operator_def.input(0);
}
bool RunOnDevice() override {
if (OperatorBase::InputBlob(0).GetRaw() == nullptr) {
return true;
}
const auto& context =
OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0);
if (context) {
LOG(INFO) << "Closing connections: " << cw_name_;
context->closeConnections();
}
return true;
}
private:
std::string cw_name_;
};
} // namespace gloo
} // namespace caffe2
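
Stripped of the operator plumbing, rendezvousWithStore above reduces to a handful of Gloo calls. The sketch below mirrors it with a file-system store; makeCommonWorld, FileStore, the /tmp path and the prefix string are illustrative choices, not what the operator itself uses:

#include <memory>
#include <string>

#include <gloo/rendezvous/context.h>
#include <gloo/rendezvous/file_store.h>
#include <gloo/rendezvous/prefix_store.h>
#include <gloo/transport/tcp/device.h>

// Sketch only: full-mesh rendezvous of `size` processes through a shared
// file-system store, mirroring rendezvousWithStore above.
std::shared_ptr<::gloo::Context> makeCommonWorld(int rank, int size) {
  ::gloo::transport::tcp::attr attr;  // default interface selection
  auto device = ::gloo::transport::tcp::CreateDevice(attr);

  ::gloo::rendezvous::FileStore fileStore("/tmp/gloo_rendezvous");  // example path
  ::gloo::rendezvous::PrefixStore store("my_common_world", fileStore);

  auto context = std::make_shared<::gloo::rendezvous::Context>(rank, size);
  context->connectFullMesh(store, device);
  return context;
}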

View File

@ -1,35 +0,0 @@
#include "caffe2/contrib/gloo/common_world_ops.h"
#include "caffe2/core/context_gpu.h"
#include <gloo/cuda.h>
#include <gloo/transport/tcp/device.h>
namespace caffe2 {
namespace gloo {
template <>
void CreateCommonWorld<CUDAContext>::initializeForContext() {
static std::once_flag once;
std::call_once(once, [&]() {
// This is the first time we call Gloo code for a CUDAContext.
// Share Caffe2 CUDA mutex with Gloo.
::gloo::CudaShared::setMutex(&CUDAContext::mutex());
});
}
namespace {
REGISTER_CUDA_OPERATOR_WITH_ENGINE(
CreateCommonWorld,
GLOO,
CreateCommonWorld<CUDAContext>);
REGISTER_CUDA_OPERATOR_WITH_ENGINE(
CloneCommonWorld,
GLOO,
CloneCommonWorld<CUDAContext>);
} // namespace
} // namespace gloo
} // namespace caffe2

View File

@ -1,12 +0,0 @@
#include "context.h"
#include <c10/util/typeid.h>
#include <gloo/types.h>
namespace caffe2 {
CAFFE_KNOWN_TYPE(::gloo::float16);
CAFFE_KNOWN_TYPE(std::shared_ptr<::gloo::Context>);
} // namespace caffe2

View File

@ -1,3 +0,0 @@
#pragma once
#include <gloo/context.h>

View File

@ -1,706 +0,0 @@
#!/usr/bin/env python3
from hypothesis import given, settings
import hypothesis.strategies as st
from multiprocessing import Process, Queue
import numpy as np
import os
import pickle
import tempfile
import shutil
from caffe2.python import core, workspace, dyndep
import caffe2.python.hypothesis_test_util as hu
from gloo.python import IoError
dyndep.InitOpsLibrary("@/caffe2/caffe2/distributed:file_store_handler_ops")
dyndep.InitOpsLibrary("@/caffe2/caffe2/distributed:redis_store_handler_ops")
dyndep.InitOpsLibrary("@/caffe2/caffe2/distributed:store_ops")
dyndep.InitOpsLibrary("@/caffe2/caffe2/contrib/gloo:gloo_ops")
dyndep.InitOpsLibrary("@/caffe2/caffe2/contrib/gloo:gloo_ops_gpu")
op_engine = 'GLOO'
class TemporaryDirectory:
def __enter__(self):
self.tmpdir = tempfile.mkdtemp()
return self.tmpdir
def __exit__(self, type, value, traceback):
shutil.rmtree(self.tmpdir)
class TestCase(hu.HypothesisTestCase):
test_counter = 0
sync_counter = 0
def run_test_locally(self, fn, device_option=None, **kwargs):
# Queue for assertion errors on subprocesses
queue = Queue()
# Capture any exception thrown by the subprocess
def run_fn(*args, **kwargs):
try:
with core.DeviceScope(device_option):
fn(*args, **kwargs)
workspace.ResetWorkspace()
queue.put(True)
except Exception as ex:
queue.put(ex)
# Start N processes in the background
procs = []
for i in range(kwargs['comm_size']):
kwargs['comm_rank'] = i
proc = Process(
target=run_fn,
kwargs=kwargs)
proc.start()
procs.append(proc)
# Test complete, join background processes
while len(procs) > 0:
proc = procs.pop(0)
while proc.is_alive():
proc.join(10)
# Raise exception if we find any. Otherwise each worker
# should put a True into the queue
# Note that the following is executed ALSO after
# the last process was joined, so if ANY exception
# was raised, it will be re-raised here.
self.assertFalse(queue.empty(), "Job failed without a result")
o = queue.get()
if isinstance(o, Exception):
raise o
else:
self.assertTrue(o)
def run_test_distributed(self, fn, device_option=None, **kwargs):
comm_rank = os.getenv('COMM_RANK')
self.assertIsNotNone(comm_rank)
comm_size = os.getenv('COMM_SIZE')
self.assertIsNotNone(comm_size)
kwargs['comm_rank'] = int(comm_rank)
kwargs['comm_size'] = int(comm_size)
with core.DeviceScope(device_option):
fn(**kwargs)
workspace.ResetWorkspace()
def create_common_world(self, comm_rank, comm_size, tmpdir=None, existing_cw=None):
store_handler = "store_handler"
# If REDIS_HOST is set, use RedisStoreHandler for rendezvous.
if existing_cw is None:
redis_host = os.getenv("REDIS_HOST")
redis_port = int(os.getenv("REDIS_PORT", 6379))
if redis_host is not None:
workspace.RunOperatorOnce(
core.CreateOperator(
"RedisStoreHandlerCreate",
[],
[store_handler],
prefix=str(TestCase.test_counter) + "/",
host=redis_host,
port=redis_port))
else:
workspace.RunOperatorOnce(
core.CreateOperator(
"FileStoreHandlerCreate",
[],
[store_handler],
path=tmpdir))
common_world = "common_world"
else:
common_world = str(existing_cw) + ".forked"
if existing_cw is not None:
workspace.RunOperatorOnce(
core.CreateOperator(
"CloneCommonWorld",
[existing_cw],
[common_world],
sync=True,
engine=op_engine))
else:
workspace.RunOperatorOnce(
core.CreateOperator(
"CreateCommonWorld",
[store_handler],
[common_world],
size=comm_size,
rank=comm_rank,
sync=True,
engine=op_engine))
return (store_handler, common_world)
def synchronize(self, store_handler, value, comm_rank=None):
TestCase.sync_counter += 1
blob = "sync_{}".format(TestCase.sync_counter)
if comm_rank == 0:
workspace.FeedBlob(blob, pickle.dumps(value))
workspace.RunOperatorOnce(
core.CreateOperator(
"StoreSet",
[store_handler, blob],
[]))
else:
workspace.RunOperatorOnce(
core.CreateOperator(
"StoreGet",
[store_handler],
[blob]))
return pickle.loads(workspace.FetchBlob(blob))
def _test_broadcast(self,
comm_rank=None,
comm_size=None,
blob_size=None,
num_blobs=None,
tmpdir=None,
use_float16=False,
):
store_handler, common_world = self.create_common_world(
comm_rank=comm_rank,
comm_size=comm_size,
tmpdir=tmpdir)
blob_size = self.synchronize(
store_handler,
blob_size,
comm_rank=comm_rank)
num_blobs = self.synchronize(
store_handler,
num_blobs,
comm_rank=comm_rank)
for i in range(comm_size):
blobs = []
for j in range(num_blobs):
blob = "blob_{}".format(j)
offset = (comm_rank * num_blobs) + j
value = np.full(blob_size, offset,
np.float16 if use_float16 else np.float32)
workspace.FeedBlob(blob, value)
blobs.append(blob)
net = core.Net("broadcast")
net.Broadcast(
[common_world] + blobs,
blobs,
root=i,
engine=op_engine)
workspace.CreateNet(net)
workspace.RunNet(net.Name())
for j in range(num_blobs):
np.testing.assert_array_equal(
workspace.FetchBlob(blobs[j]),
i * num_blobs)
# Run the net a few more times to check the operator
# works not just the first time it's called
for _tmp in range(4):
workspace.RunNet(net.Name())
@given(comm_size=st.integers(min_value=2, max_value=8),
blob_size=st.integers(min_value=int(1e3), max_value=int(1e6)),
num_blobs=st.integers(min_value=1, max_value=4),
device_option=st.sampled_from([hu.cpu_do]),
use_float16=st.booleans())
@settings(deadline=10000)
def test_broadcast(self, comm_size, blob_size, num_blobs, device_option,
use_float16):
TestCase.test_counter += 1
if os.getenv('COMM_RANK') is not None:
self.run_test_distributed(
self._test_broadcast,
blob_size=blob_size,
num_blobs=num_blobs,
use_float16=use_float16,
device_option=device_option)
else:
with TemporaryDirectory() as tmpdir:
self.run_test_locally(
self._test_broadcast,
comm_size=comm_size,
blob_size=blob_size,
num_blobs=num_blobs,
device_option=device_option,
tmpdir=tmpdir,
use_float16=use_float16)
def _test_allreduce(self,
comm_rank=None,
comm_size=None,
blob_size=None,
num_blobs=None,
tmpdir=None,
use_float16=False
):
store_handler, common_world = self.create_common_world(
comm_rank=comm_rank,
comm_size=comm_size,
tmpdir=tmpdir)
blob_size = self.synchronize(
store_handler,
blob_size,
comm_rank=comm_rank)
num_blobs = self.synchronize(
store_handler,
num_blobs,
comm_rank=comm_rank)
blobs = []
for i in range(num_blobs):
blob = "blob_{}".format(i)
value = np.full(blob_size, (comm_rank * num_blobs) + i,
np.float16 if use_float16 else np.float32)
workspace.FeedBlob(blob, value)
blobs.append(blob)
net = core.Net("allreduce")
net.Allreduce(
[common_world] + blobs,
blobs,
engine=op_engine)
workspace.CreateNet(net)
workspace.RunNet(net.Name())
for i in range(num_blobs):
np.testing.assert_array_equal(
workspace.FetchBlob(blobs[i]),
(num_blobs * comm_size) * (num_blobs * comm_size - 1) / 2)
# Run the net a few more times to check the operator
# works not just the first time it's called
for _tmp in range(4):
workspace.RunNet(net.Name())
def _test_allreduce_multicw(self,
comm_rank=None,
comm_size=None,
tmpdir=None
):
_store_handler, common_world = self.create_common_world(
comm_rank=comm_rank,
comm_size=comm_size,
tmpdir=tmpdir)
_, common_world2 = self.create_common_world(
comm_rank=comm_rank,
comm_size=comm_size,
tmpdir=tmpdir,
existing_cw=common_world)
blob_size = int(1e4)
num_blobs = 4
for cw in [common_world, common_world2]:
blobs = []
for i in range(num_blobs):
blob = "blob_{}".format(i)
value = np.full(blob_size, (comm_rank * num_blobs) + i, np.float32)
workspace.FeedBlob(blob, value)
blobs.append(blob)
net = core.Net("allreduce_multicw")
net.Allreduce(
[cw] + blobs,
blobs,
engine=op_engine)
workspace.RunNetOnce(net)
for i in range(num_blobs):
np.testing.assert_array_equal(
workspace.FetchBlob(blobs[i]),
(num_blobs * comm_size) * (num_blobs * comm_size - 1) / 2)
@given(comm_size=st.integers(min_value=2, max_value=8),
blob_size=st.integers(min_value=int(1e3), max_value=int(1e6)),
num_blobs=st.integers(min_value=1, max_value=4),
device_option=st.sampled_from([hu.cpu_do]),
use_float16=st.booleans())
@settings(deadline=10000)
def test_allreduce(self, comm_size, blob_size, num_blobs, device_option,
use_float16):
TestCase.test_counter += 1
if os.getenv('COMM_RANK') is not None:
self.run_test_distributed(
self._test_allreduce,
blob_size=blob_size,
num_blobs=num_blobs,
use_float16=use_float16,
device_option=device_option)
else:
with TemporaryDirectory() as tmpdir:
self.run_test_locally(
self._test_allreduce,
comm_size=comm_size,
blob_size=blob_size,
num_blobs=num_blobs,
device_option=device_option,
tmpdir=tmpdir,
use_float16=use_float16)
def _test_reduce_scatter(self,
comm_rank=None,
comm_size=None,
blob_size=None,
num_blobs=None,
tmpdir=None,
use_float16=False
):
store_handler, common_world = self.create_common_world(
comm_rank=comm_rank,
comm_size=comm_size,
tmpdir=tmpdir)
blob_size = self.synchronize(
store_handler,
blob_size,
comm_rank=comm_rank)
num_blobs = self.synchronize(
store_handler,
num_blobs,
comm_rank=comm_rank)
blobs = []
for i in range(num_blobs):
blob = "blob_{}".format(i)
value = np.full(blob_size, (comm_rank * num_blobs) + i,
np.float16 if use_float16 else np.float32)
workspace.FeedBlob(blob, value)
blobs.append(blob)
# Specify distribution among ranks i.e. number of elements
# scattered/distributed to each process.
recv_counts = np.zeros(comm_size, dtype=np.int32)
remaining = blob_size
        chunk_size = (blob_size + comm_size - 1) // comm_size
for i in range(comm_size):
recv_counts[i] = min(chunk_size, remaining)
remaining = remaining - chunk_size if remaining > chunk_size else 0
recv_counts_blob = "recvCounts"
workspace.FeedBlob(recv_counts_blob, recv_counts)
blobs.append(recv_counts_blob)
net = core.Net("reduce_scatter")
net.ReduceScatter(
[common_world] + blobs,
blobs,
engine=op_engine)
workspace.CreateNet(net)
workspace.RunNet(net.Name())
for i in range(num_blobs):
np.testing.assert_array_equal(
np.resize(workspace.FetchBlob(blobs[i]), recv_counts[comm_rank]),
(num_blobs * comm_size) * (num_blobs * comm_size - 1) / 2)
# Run the net a few more times to check the operator
# works not just the first time it's called
for _tmp in range(4):
workspace.RunNet(net.Name())
@given(comm_size=st.integers(min_value=2, max_value=8),
blob_size=st.integers(min_value=int(1e3), max_value=int(1e6)),
num_blobs=st.integers(min_value=1, max_value=4),
device_option=st.sampled_from([hu.cpu_do]),
use_float16=st.booleans())
@settings(deadline=10000)
def test_reduce_scatter(self, comm_size, blob_size, num_blobs,
device_option, use_float16):
TestCase.test_counter += 1
if os.getenv('COMM_RANK') is not None:
self.run_test_distributed(
self._test_reduce_scatter,
blob_size=blob_size,
num_blobs=num_blobs,
use_float16=use_float16,
device_option=device_option)
else:
with TemporaryDirectory() as tmpdir:
self.run_test_locally(
self._test_reduce_scatter,
comm_size=comm_size,
blob_size=blob_size,
num_blobs=num_blobs,
device_option=device_option,
tmpdir=tmpdir,
use_float16=use_float16)
def _test_allgather(self,
comm_rank=None,
comm_size=None,
blob_size=None,
num_blobs=None,
tmpdir=None,
use_float16=False
):
store_handler, common_world = self.create_common_world(
comm_rank=comm_rank,
comm_size=comm_size,
tmpdir=tmpdir)
blob_size = self.synchronize(
store_handler,
blob_size,
comm_rank=comm_rank)
num_blobs = self.synchronize(
store_handler,
num_blobs,
comm_rank=comm_rank)
blobs = []
for i in range(num_blobs):
blob = "blob_{}".format(i)
value = np.full(blob_size, (comm_rank * num_blobs) + i,
np.float16 if use_float16 else np.float32)
workspace.FeedBlob(blob, value)
blobs.append(blob)
net = core.Net("allgather")
net.Allgather(
[common_world] + blobs,
["Gathered"],
engine=op_engine)
workspace.CreateNet(net)
workspace.RunNet(net.Name())
# create expected output
expected_output = np.array([])
for i in range(comm_size):
for j in range(num_blobs):
value = np.full(blob_size, (i * num_blobs) + j,
np.float16 if use_float16 else np.float32)
expected_output = np.concatenate((expected_output, value))
np.testing.assert_array_equal(
workspace.FetchBlob("Gathered"), expected_output)
# Run the net a few more times to check the operator
# works not just the first time it's called
for _tmp in range(4):
workspace.RunNet(net.Name())
@given(comm_size=st.integers(min_value=2, max_value=8),
blob_size=st.integers(min_value=int(1e3), max_value=int(1e6)),
num_blobs=st.integers(min_value=1, max_value=4),
device_option=st.sampled_from([hu.cpu_do]),
use_float16=st.booleans())
@settings(max_examples=10, deadline=None)
def test_allgather(self, comm_size, blob_size, num_blobs, device_option,
use_float16):
TestCase.test_counter += 1
if os.getenv('COMM_RANK') is not None:
self.run_test_distributed(
self._test_allgather,
blob_size=blob_size,
num_blobs=num_blobs,
use_float16=use_float16,
device_option=device_option)
else:
with TemporaryDirectory() as tmpdir:
self.run_test_locally(
self._test_allgather,
comm_size=comm_size,
blob_size=blob_size,
num_blobs=num_blobs,
device_option=device_option,
tmpdir=tmpdir,
use_float16=use_float16)
@given(device_option=st.sampled_from([hu.cpu_do]))
@settings(deadline=10000)
def test_forked_cw(self, device_option):
TestCase.test_counter += 1
if os.getenv('COMM_RANK') is not None:
self.run_test_distributed(
self._test_allreduce_multicw,
device_option=device_option)
else:
# Note: this test exercises the path where we fork a common world.
# We therefore don't need a comm size larger than 2. It used to be
# run with comm_size=8, which causes flaky results in a stress run.
# The flakiness was caused by too many listening sockets being
# created by Gloo context initialization (8 processes times
# 7 sockets times 20-way concurrency, plus TIME_WAIT).
with TemporaryDirectory() as tmpdir:
self.run_test_locally(
self._test_allreduce_multicw,
comm_size=2,
device_option=device_option,
tmpdir=tmpdir)
def _test_barrier(
self,
comm_rank=None,
comm_size=None,
tmpdir=None,
):
store_handler, common_world = self.create_common_world(
comm_rank=comm_rank, comm_size=comm_size, tmpdir=tmpdir
)
net = core.Net("barrier")
net.Barrier(
[common_world],
[],
engine=op_engine)
workspace.CreateNet(net)
workspace.RunNet(net.Name())
# Run the net a few more times to check the operator
# works not just the first time it's called
for _tmp in range(4):
workspace.RunNet(net.Name())
@given(comm_size=st.integers(min_value=2, max_value=8),
device_option=st.sampled_from([hu.cpu_do]))
@settings(deadline=10000)
def test_barrier(self, comm_size, device_option):
TestCase.test_counter += 1
if os.getenv('COMM_RANK') is not None:
self.run_test_distributed(
self._test_barrier,
device_option=device_option)
else:
with TemporaryDirectory() as tmpdir:
self.run_test_locally(
self._test_barrier,
comm_size=comm_size,
device_option=device_option,
tmpdir=tmpdir)
def _test_close_connection(
self,
comm_rank=None,
comm_size=None,
tmpdir=None,
):
'''
        One node closes its connections while the others wait on a barrier.
        The test checks that every process still exits eventually.
'''
# Caffe's for closers only:
# https://www.youtube.com/watch?v=QMFwFgG9NE8
        closer = comm_rank == comm_size // 2
store_handler, common_world = self.create_common_world(
comm_rank=comm_rank, comm_size=comm_size, tmpdir=tmpdir
)
net = core.Net("barrier_or_close")
if not closer:
net.Barrier(
[common_world],
[],
engine=op_engine)
else:
net.DestroyCommonWorld(
[common_world], [common_world], engine=op_engine)
# Sleep a bit to ensure others start the barrier
import time
time.sleep(0.1)
workspace.CreateNet(net)
workspace.RunNet(net.Name())
@given(comm_size=st.integers(min_value=2, max_value=8),
device_option=st.sampled_from([hu.cpu_do]))
@settings(deadline=10000)
def test_close_connection(self, comm_size, device_option):
import time
start_time = time.time()
TestCase.test_counter += 1
if os.getenv('COMM_RANK') is not None:
self.run_test_distributed(
self._test_close_connection,
device_option=device_option)
else:
with TemporaryDirectory() as tmpdir:
self.run_test_locally(
self._test_close_connection,
comm_size=comm_size,
device_option=device_option,
tmpdir=tmpdir)
# Check that test finishes quickly because connections get closed.
# This assert used to check that the end to end runtime was less
# than 2 seconds, but this may not always be the case if there
# is significant overhead in starting processes. Ideally, this
# assert is replaced by one that doesn't depend on time but rather
# checks the success/failure status of the barrier that is run.
self.assertLess(time.time() - start_time, 20.0)
def _test_io_error(
self,
comm_rank=None,
comm_size=None,
tmpdir=None,
):
'''
Only one node will participate in allreduce, resulting in an IoError
'''
store_handler, common_world = self.create_common_world(
comm_rank=comm_rank,
comm_size=comm_size,
tmpdir=tmpdir)
if comm_rank == 0:
blob_size = 1000
num_blobs = 1
blobs = []
for i in range(num_blobs):
blob = "blob_{}".format(i)
value = np.full(
blob_size, (comm_rank * num_blobs) + i, np.float32
)
workspace.FeedBlob(blob, value)
blobs.append(blob)
net = core.Net("allreduce")
net.Allreduce(
[common_world] + blobs,
blobs,
engine=op_engine)
workspace.CreateNet(net)
workspace.RunNet(net.Name())
@given(comm_size=st.integers(min_value=2, max_value=8),
device_option=st.sampled_from([hu.cpu_do]))
@settings(deadline=10000)
def test_io_error(self, comm_size, device_option):
TestCase.test_counter += 1
with self.assertRaises(IoError):
if os.getenv('COMM_RANK') is not None:
self.run_test_distributed(
self._test_io_error,
device_option=device_option)
else:
with TemporaryDirectory() as tmpdir:
self.run_test_locally(
self._test_io_error,
comm_size=comm_size,
device_option=device_option,
tmpdir=tmpdir)
if __name__ == "__main__":
import unittest
unittest.main()

View File

@ -1,15 +0,0 @@
#include <gloo/common/error.h>
#include <pybind11/pybind11.h>
namespace gloo {
namespace python {
namespace py = pybind11;
PYBIND11_MODULE(python, m) {
m.doc() = "Python interface for Gloo";
py::register_exception<IoException>(m, "IoError");
}
} // namespace python
} // namespace gloo

View File

@ -1,53 +0,0 @@
/**
* Copyright (c) 2018-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "reduce_scatter_ops.h"
#include <gloo/reduce_scatter.h>
#include <gloo/types.h>
namespace caffe2 {
namespace gloo {
template <class Context>
void ReduceScatterOp<Context>::initializeHalvingDoubling() {
if (init_.template IsType<float>()) {
algorithm_.reset(new ::gloo::ReduceScatterHalvingDoubling<float>(
init_.context,
init_.template getOutputs<float>(),
init_.size,
recvCounts_));
} else if (init_.template IsType<::at::Half>()) {
algorithm_.reset(new ::gloo::ReduceScatterHalvingDoubling<::gloo::float16>(
init_.context,
init_.template getOutputs<::gloo::float16>(),
init_.size,
recvCounts_));
} else {
CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
}
}
namespace {
REGISTER_CPU_OPERATOR_WITH_ENGINE(
ReduceScatter,
GLOO,
ReduceScatterOp<CPUContext>);
} // namespace
} // namespace gloo
} // namespace caffe2

View File

@ -1,131 +0,0 @@
/**
* Copyright (c) 2018-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <algorithm>
#include "caffe2/contrib/gloo/common.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
#include <gloo/algorithm.h>
#include <gloo/common/error.h>
#include <gloo/context.h>
namespace caffe2 {
namespace gloo {
template <class Context>
class ReduceScatterOp final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
ReduceScatterOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
ws_(ws),
status_blob_(
OperatorBase::GetSingleArgument<std::string>("status_blob", "")) {
if (status_blob_ != "") {
ws_->CreateBlob(status_blob_);
}
}
~ReduceScatterOp() override {}
bool RunOnDevice() override {
std::call_once(once_, [&] { initialize(); });
// If any parameter has changed in between runs, the initialized
// algorithm is invalid and cannot be used.
update(current_);
CAFFE_ENFORCE(current_ == init_, "Inputs/outputs have changed");
try {
algorithm_->run();
} catch (::gloo::IoException& ioe) {
LOG(ERROR) << "Caught gloo IO exception: " << ioe.what();
if (status_blob_ != "") {
signalFailure(ws_->GetBlob(status_blob_), ioe);
return false;
} else {
throw;
}
}
return true;
}
protected:
void initialize() {
// Store which inputs/outputs this instance initialized with
update(init_);
// Verify inputs == outputs
CAFFE_ENFORCE_EQ(init_.inputs.size(), init_.outputs.size());
for (const auto i : c10::irange(init_.inputs.size())) {
CAFFE_ENFORCE_EQ(init_.inputs[i], init_.outputs[i]);
}
// Verify tensors all have same size
size_t size = Input(1).numel();
for (auto i = 2; i < InputSize() - 1; i++) {
CAFFE_ENFORCE_EQ(Input(i).numel(), size);
}
// Verify tensors all have same type
TypeMeta meta = Input(1).dtype();
for (auto i = 2; i < InputSize() - 1; i++) {
CAFFE_ENFORCE(Input(i).dtype() == meta);
}
initializeHalvingDoubling();
}
void initializeHalvingDoubling();
std::once_flag once_;
std::unique_ptr<::gloo::Algorithm> algorithm_;
// Captures the parameters passed to Gloo when first initialized.
// An instance is updated every time this op runs and is compared
// to the reference instance for equality. If any parameter has
// changed from run to run, the initialized algorithm is invalid.
void update(GlooParameters& params) {
params.context = OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0);
params.inputs.resize(InputSize() - 2);
params.outputs.resize(OutputSize() - 1);
for (const auto i : c10::irange(params.inputs.size())) {
params.inputs[i] = Input(i + 1).raw_data();
params.outputs[i] = Output(i)->raw_mutable_data();
}
params.size = Output(0)->numel();
params.meta = Output(0)->dtype();
// Verify recvCountsSize == comm_size
CAFFE_ENFORCE_EQ(Input(InputSize() - 1).numel(), params.context->size);
int* recvCounts = (int*)Input(InputSize() - 1).raw_data();
recvCounts_.assign(recvCounts, recvCounts + Input(InputSize() - 1).numel());
}
GlooParameters init_;
GlooParameters current_;
Workspace* ws_;
std::string status_blob_;
std::vector<int> recvCounts_;
};
} // namespace gloo
} // namespace caffe2

View File

@ -1,25 +0,0 @@
#include "store_handler.h"
namespace caffe2 {
namespace gloo {
void StoreHandlerWrapper::set(
const std::string& key,
const std::vector<char>& data) {
std::string stringValue(data.data(), data.size());
handler_.set(key, stringValue);
}
std::vector<char> StoreHandlerWrapper::get(const std::string& key) {
std::string str = handler_.get(key);
return std::vector<char>(str.begin(), str.end());
}
void StoreHandlerWrapper::wait(
const std::vector<std::string>& keys,
const std::chrono::milliseconds& timeout) {
handler_.wait(keys, timeout);
}
} // namespace gloo
} // namespace caffe2

View File

@ -1,35 +0,0 @@
#pragma once
#include "caffe2/core/common.h"
#include "caffe2/distributed/store_handler.h"
#include <gloo/rendezvous/store.h>
namespace caffe2 {
namespace gloo {
class TORCH_API StoreHandlerWrapper : public ::gloo::rendezvous::Store {
public:
explicit StoreHandlerWrapper(StoreHandler& handler) : handler_(handler) {}
virtual ~StoreHandlerWrapper() override {}
virtual void set(const std::string& key, const std::vector<char>& data)
override;
std::vector<char> get(const std::string& key) override;
void wait(const std::vector<std::string>& keys) override {
wait(keys, ::gloo::rendezvous::Store::kDefaultTimeout);
}
virtual void wait(
const std::vector<std::string>& keys,
const std::chrono::milliseconds& timeout) override;
protected:
StoreHandler& handler_;
};
} // namespace gloo
} // namespace caffe2

View File

@ -1,20 +0,0 @@
if(USE_MKLDNN)
message(STATUS "Including IDEEP operators")
# ---[ CPU files.
file(GLOB_RECURSE tmp *.cc)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp})
  # exclude test files
file(GLOB_RECURSE tmp *_test.cc)
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${tmp})
# ---[ CPU test files - currently none but just to be safe
file(GLOB_RECURSE tmp *_test.cc)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} ${tmp})
# ---[ Send the lists to the parent scope.
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
else()
message(STATUS "Excluding ideep operators as we are not using ideep")
endif()

View File

@ -1,25 +0,0 @@
if(USE_NCCL)
if(USE_CUDA)
message(STATUS "Include NCCL operators")
set(Caffe2_CONTRIB_NCCL_GPU_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/cuda_nccl_gpu.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/cuda_nccl_op_gpu.cc"
)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${Caffe2_CONTRIB_NCCL_GPU_SRC})
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
endif(USE_CUDA)
if(USE_ROCM)
message(STATUS "Include AMD RCCL operators")
set(Caffe2_CONTRIB_NCCL_HIP_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/hip/hip_nccl_gpu.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/hip/hip_nccl_op_gpu.cc"
)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${Caffe2_CONTRIB_NCCL_HIP_SRC})
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE)
endif(USE_ROCM)
else()
message(STATUS "NCCL operators skipped due to no CUDA support")
endif()

View File

@ -1,322 +0,0 @@
#include "caffe2/contrib/nccl/cuda_nccl_gpu.h"
namespace caffe2 {
namespace nccl {
namespace {
std::vector<int> getDevices(const NCCLExecution& ex) {
std::vector<int> result;
result.reserve(ex.elements.size());
for (const auto& el : ex.elements) {
result.push_back(el.device);
}
return result;
}
class NCCLContext {
public:
explicit NCCLContext(const NCCLExecution& ex)
: devices_(getDevices(ex)), master_gpu_id_(ex.stream_gpu_id) {
comms_.resize(devices_.size());
CAFFE_NCCL_CHECK(
ncclCommInitAll(comms_.data(), devices_.size(), devices_.data()));
streams_.resize(devices_.size());
events_.resize(devices_.size());
for (auto i = 0U; i < devices_.size(); ++i) {
CUDAGuard g(devices_[i]);
// get stream priorities
int lo_pri, hi_pri;
CUDA_ENFORCE(cudaDeviceGetStreamPriorityRange(&lo_pri, &hi_pri));
CUDA_ENFORCE(cudaStreamCreateWithPriority(
&streams_[i], cudaStreamNonBlocking, hi_pri));
CUDA_ENFORCE(cudaEventCreateWithFlags(
&events_[i], cudaEventDefault | cudaEventDisableTiming));
}
CUDAGuard g(master_gpu_id_);
CUDA_ENFORCE(cudaEventCreateWithFlags(
&master_event_, cudaEventDefault | cudaEventDisableTiming));
}
~NCCLContext() {
for (auto i = 0U; i < devices_.size(); ++i) {
CUDAGuard g(devices_[i]);
CUDA_ENFORCE(cudaStreamDestroy(streams_[i]));
CUDA_ENFORCE(cudaEventDestroy(events_[i]));
}
CUDAGuard g(master_gpu_id_);
CUDA_ENFORCE(cudaEventDestroy(master_event_));
for (auto& comm : comms_) {
ncclCommDestroy(comm);
}
}
std::vector<int> devices_;
std::vector<ncclComm_t> comms_;
std::vector<cudaStream_t> streams_;
int master_gpu_id_;
cudaEvent_t master_event_;
std::vector<cudaEvent_t> events_;
C10_DISABLE_COPY_AND_ASSIGN(NCCLContext);
};
// We share the contexts across multiple operators, hence the cache.
static std::mutex& gContextsMutex() {
static std::mutex m;
return m;
}
std::unordered_map<std::string, std::unique_ptr<NCCLContext>>& gContexts() {
static std::unordered_map<std::string, std::unique_ptr<NCCLContext>> m;
return m;
}
std::string ncclKey(const NCCLExecution& ex) {
std::string result;
int curr_device;
CUDA_CHECK(cudaGetDevice(&curr_device));
result += to_string(curr_device) + ":";
for (const auto& el : ex.elements) {
result += to_string(el.device) + ",";
}
return result;
}
NCCLContext* getNCCLContext(const NCCLExecution& ex) {
auto& contexts = gContexts();
const auto key = ncclKey(ex);
if (!contexts[key]) {
LOG(INFO) << "Creating NCCLContext for key: " << key;
contexts[key].reset(new NCCLContext(ex));
}
return TORCH_CHECK_NOTNULL(contexts[key].get());
}
template <typename T>
class ncclTypeWrapper;
template <>
class ncclTypeWrapper<float> {
public:
static const ncclDataType_t type = ncclFloat;
};
template <>
class ncclTypeWrapper<int> {
public:
static const ncclDataType_t type = ncclInt;
};
#ifdef CAFFE_HAS_CUDA_FP16
template <>
class ncclTypeWrapper<at::Half> {
public:
static const ncclDataType_t type = ncclHalf;
};
#endif
template <typename T, typename InitF, typename F>
void runNCCL(const NCCLExecution& ex, InitF&& init_f, F&& f) {
// do initialization
for (auto i = 0U; i < ex.elements.size(); ++i) {
auto& ctx = ex.elements[i];
CUDAGuard g(ctx.device);
init_f(ex.elements[i]);
}
std::lock_guard<std::mutex> g(gContextsMutex());
auto* context = getNCCLContext(ex);
auto& comms = context->comms_;
auto& streams = context->streams_;
auto& events = context->events_;
// Record an event on the master context, wait on it in each of the
// children streams, so the children streams are synchronized WRT
// the original stream.
{
CUDAGuard g(ex.stream_gpu_id);
CUDA_ENFORCE(cudaEventRecord(context->master_event_, ex.stream));
}
{
// lock out alloc / free while NCCL launches
std::lock_guard<std::mutex> lock(CUDAContext::mutex());
#if NCCL_VERSION_MIN(2, 0, 0)
CAFFE_NCCL_CHECK(ncclGroupStart());
#endif
for (auto i = 0U; i < ex.elements.size(); ++i) {
auto& ctx = ex.elements[i];
CUDAGuard g(ctx.device);
auto& comm = comms[i];
auto& stream = streams[i];
TORCH_DCHECK_EQ(ctx.device, GetGPUIDForPointer(ctx.src->raw_data()));
CUDA_ENFORCE(cudaStreamWaitEvent(stream, context->master_event_, 0));
f(ctx, comm, stream);
}
#if NCCL_VERSION_MIN(2, 0, 0)
CAFFE_NCCL_CHECK(ncclGroupEnd());
#endif
for (auto i = 0U; i < ex.elements.size(); ++i) {
auto& ctx = ex.elements[i];
CUDAGuard g(ctx.device);
auto& stream = streams[i];
auto& event = events[i];
// Record an event on each children stream that we have finished
// our computation
CUDA_ENFORCE(cudaEventRecord(event, stream));
}
}
// Now, wait on all the events in the original stream.
CUDAGuard dg(ex.stream_gpu_id);
for (auto& event : events) {
CUDA_ENFORCE(cudaStreamWaitEvent(TORCH_CHECK_NOTNULL(ex.stream), event, 0));
}
}
} // namespace
void destroyContexts() {
std::lock_guard<std::mutex> g(gContextsMutex());
auto& contexts = gContexts();
contexts.clear();
}
template <typename T>
void NCCL<T>::AllReduce(const NCCLExecution& ex) {
return runNCCL<T>(
ex,
[](const NCCLElement& ctx) {
ctx.dst->Resize(ctx.src->sizes());
ctx.dst->template mutable_data<T>();
},
[](const NCCLElement& ctx, ncclComm_t comm, cudaStream_t stream) {
CAFFE_NCCL_CHECK(ncclAllReduce(
ctx.src->raw_data(),
ctx.dst->raw_mutable_data(),
ctx.dst->numel(),
ncclTypeWrapper<T>::type,
ncclSum,
comm,
stream));
});
}
template <typename T>
void NCCL<T>::Broadcast(const NCCLExecution& ex) {
return runNCCL<T>(
ex,
[](const NCCLElement& ctx) {
ctx.dst->Resize(ctx.src->sizes());
ctx.dst->template mutable_data<T>();
},
[&ex](const NCCLElement& ctx, ncclComm_t comm, cudaStream_t stream) {
CAFFE_NCCL_CHECK(ncclBcast(
ctx.dst->raw_mutable_data(),
ctx.dst->numel(),
ncclTypeWrapper<T>::type,
ex.root,
comm,
stream));
});
}
template <typename T>
void NCCL<T>::Reduce(const NCCLExecution& ex) {
return runNCCL<T>(
ex,
[](const NCCLElement& ctx) {
if (ctx.dst) {
ctx.dst->Resize(ctx.src->sizes());
ctx.dst->template mutable_data<T>();
}
},
[&ex](const NCCLElement& ctx, ncclComm_t comm, cudaStream_t stream) {
CAFFE_NCCL_CHECK(ncclReduce(
ctx.src->raw_data(),
ctx.dst ? ctx.dst->raw_mutable_data() : nullptr,
ctx.src->numel(),
ncclTypeWrapper<T>::type,
ncclSum,
ex.root,
comm,
stream));
});
}
template <typename T>
void NCCL<T>::AllGather(const NCCLExecution& ex) {
const auto n = ex.elements.size();
return runNCCL<T>(
ex,
[n](const NCCLElement& ctx) {
CAFFE_ENFORCE_NE(ctx.src, ctx.dst);
std::vector<int64_t> dims;
dims.reserve(ctx.src->dim() + 1);
dims.push_back(n);
for (auto d : ctx.src->sizes()) {
dims.push_back(d);
}
ctx.dst->Resize(dims);
ctx.dst->template mutable_data<T>();
},
[](const NCCLElement& ctx, ncclComm_t comm, cudaStream_t stream) {
#if NCCL_VERSION_MIN(2, 0, 0)
CAFFE_NCCL_CHECK(ncclAllGather(
ctx.src->raw_data(),
ctx.dst->raw_mutable_data(),
ctx.src->numel(),
ncclTypeWrapper<T>::type,
comm,
stream));
#else
CAFFE_NCCL_CHECK(ncclAllGather(
ctx.src->raw_data(),
ctx.src->size(),
ncclTypeWrapper<T>::type,
ctx.dst->raw_mutable_data(),
comm,
stream));
#endif
});
}
template <typename T>
void NCCL<T>::ReduceScatter(const NCCLExecution& ex) {
return runNCCL<T>(
ex,
[](const NCCLElement& ctx) {
CAFFE_ENFORCE_NE(ctx.src, ctx.dst);
const auto& srcDims = ctx.src->sizes();
std::vector<int64_t> dstDims(srcDims.begin() + 1, srcDims.end());
ctx.dst->Resize(dstDims);
ctx.dst->template mutable_data<T>();
},
[](const NCCLElement& ctx, ncclComm_t comm, cudaStream_t stream) {
CAFFE_NCCL_CHECK(ncclReduceScatter(
ctx.src->raw_data(),
ctx.dst->raw_mutable_data(),
ctx.dst->numel(),
ncclTypeWrapper<T>::type,
ncclSum,
comm,
stream));
});
}
// Explicit instantiation
template class NCCL<float>;
template class NCCL<int>;
#ifdef CAFFE_HAS_CUDA_FP16
template class NCCL<at::Half>;
#endif
} // namespace nccl
} // namespace caffe2

View File

@ -1,63 +0,0 @@
#pragma once
#include <cstddef>
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/logging.h"
#include <nccl.h>
#include <unordered_map>
#define NCCL_VERSION_MIN(major, minor, patch) \
((NCCL_MAJOR > major) || \
((NCCL_MAJOR == major) && \
((NCCL_MINOR > minor) || \
((NCCL_MINOR == minor) && (NCCL_PATCH >= patch)))))
namespace caffe2 {
namespace nccl {
#define CAFFE_NCCL_CHECK(condition) \
do { \
ncclResult_t status = (condition); \
CAFFE_ENFORCE_EQ( \
status, \
ncclSuccess, \
" ", \
"Error at: ", \
__FILE__, \
__LINE__, \
": ", \
ncclGetErrorString(status)); \
} while (0)
struct NCCLElement {
const TensorCUDA* src{nullptr};
TensorCUDA* dst{nullptr};
int device{0};
};
struct NCCLExecution {
int stream_gpu_id{0};
cudaStream_t stream{nullptr};
std::vector<NCCLElement> elements;
size_t root{0};
};
// Called when the last NCCL op is destructed and all lazily created
// NCCLContext instances can safely be destroyed.
void destroyContexts();
template <typename T>
class NCCL {
public:
static void AllReduce(const NCCLExecution& ex);
static void Broadcast(const NCCLExecution& ex);
static void Reduce(const NCCLExecution& ex);
static void AllGather(const NCCLExecution& ex);
static void ReduceScatter(const NCCLExecution& ex);
};
} // namespace nccl
} // namespace caffe2
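
A hedged sketch of how CAFFE_NCCL_CHECK and NCCL_VERSION_MIN are meant to be used around raw NCCL calls, assuming the NCCL headers and at least two visible GPUs; initTwoGpuComms and the device list are illustrative:

#include "caffe2/contrib/nccl/cuda_nccl_gpu.h"

// Sketch only: initialize communicators for two devices and guard a grouped
// call with the version macro, much like the implementation file does.
void initTwoGpuComms() {
  int devices[2] = {0, 1};
  ncclComm_t comms[2];
  CAFFE_NCCL_CHECK(ncclCommInitAll(comms, 2, devices));  // throws via CAFFE_ENFORCE_EQ on error
#if NCCL_VERSION_MIN(2, 0, 0)
  CAFFE_NCCL_CHECK(ncclGroupStart());
  // ... enqueue per-device collectives here ...
  CAFFE_NCCL_CHECK(ncclGroupEnd());
#endif
  for (auto& comm : comms) {
    ncclCommDestroy(comm);
  }
}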

View File

@ -1,275 +0,0 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/operator.h"
#include "caffe2/contrib/nccl/cuda_nccl_gpu.h"
namespace caffe2 {
nccl::NCCLExecution getNCCLElements(
OperatorBase* op,
const CUDAContext& context) {
// We either do an N-N op, or an N-1 op.
CAFFE_ENFORCE(op->InputSize() == op->OutputSize() || op->OutputSize() == 1);
nccl::NCCLExecution ex;
ex.stream_gpu_id = context.device_id();
ex.stream = context.cuda_stream();
ex.root = op->template GetSingleArgument<int>("root", 0);
ex.elements.resize(op->InputSize());
for (auto i = 0; i < op->InputSize(); ++i) {
auto& el = ex.elements[i];
el.src = &(op->Input<Tensor>(i, CUDA));
if (op->OutputSize() == 1) {
// Reduce op
if (i == ex.root) {
el.dst = op->Output<Tensor>(0, CUDA);
}
} else if (i < op->OutputSize()) {
el.dst = op->Output<Tensor>(i, CUDA);
}
// TODO - expensive (>1ms) - cache these.
el.device = GetGPUIDForPointer(op->Input<Tensor>(i, CUDA).raw_data());
}
return ex;
}
namespace {
// Check if all inputs are of type T
template <typename T>
bool AllInputsAre(OperatorBase* op) {
for (auto i = 0; i < op->InputSize(); ++i) {
if (op->Input<Tensor>(i, CUDA).IsType<T>()) {
continue;
} else {
return false;
}
}
return true;
}
// Manual count of all instantiated NCCL ops.
// If this drops to zero after destructing the last NCCL op,
// it means we can safely destroy all lazily created NCCL contexts.
std::atomic<int> kNCCLOpCounter(0);
}; // namespace
class NCCLBaseOp : public Operator<CUDAContext> {
public:
using Operator::Operator;
NCCLBaseOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CUDAContext>(operator_def, ws) {
kNCCLOpCounter++;
}
~NCCLBaseOp() {
if (--kNCCLOpCounter == 0) {
nccl::destroyContexts();
}
}
};
class NCCLAllreduceOp final : public NCCLBaseOp {
public:
using NCCLBaseOp::NCCLBaseOp;
bool RunOnDevice() override {
if (InputSize() == 1)
return true;
if (AllInputsAre<float>(this)) {
nccl::NCCL<float>::AllReduce(getNCCLElements(this, context_));
return true;
} else if (AllInputsAre<at::Half>(this)) {
nccl::NCCL<at::Half>::AllReduce(getNCCLElements(this, context_));
return true;
} else {
return false;
}
}
static std::vector<TensorShape> ShapeInference(
const OperatorDef& def,
const std::vector<TensorShape>& in) {
auto n_outputs = def.output_size();
CAFFE_ENFORCE(
n_outputs == 1 || n_outputs == in.size(),
"NCCLAllreduce only supports N-1 or N-N reductions");
for (auto i = 0; i < in.size(); i++) {
CAFFE_ENFORCE(
in[0].dims_size() == in[i].dims_size(),
"NCCLAllreduce requires inputs of same dimension");
for (auto j = 0; j < in[0].dims_size(); j++) {
CAFFE_ENFORCE(
in[0].dims(j) == in[i].dims(j),
"NCCLAllreduce requires inputs to be of same shape");
}
}
std::vector<TensorShape> out(n_outputs);
for (auto i = 0; i < out.size(); i++) {
out[i] = in[0];
}
return out;
}
static struct OpSchema::Cost CostInference(
const OperatorDef& def,
const vector<TensorShape>& inputs) {
    CAFFE_ENFORCE_GE(inputs.size(), 1, "NCCLAllreduce requires at least 1 input");
const TensorShape X0 = inputs[0];
const auto nElem = nElemFromDim(inputs[0]);
struct OpSchema::Cost c;
c.flops = (inputs.size() - 1) * nElem;
c.bytes_read = inputs.size() * nElem;
c.bytes_written = def.output_size() * nElem;
c.params_bytes = 0;
return c;
}
};
class NCCLBroadcastOp final : public NCCLBaseOp {
public:
using NCCLBaseOp::NCCLBaseOp;
bool RunOnDevice() override {
if (InputSize() == 1)
return true;
if (AllInputsAre<float>(this)) {
nccl::NCCL<float>::Broadcast(getNCCLElements(this, context_));
return true;
} else if (AllInputsAre<at::Half>(this)) {
nccl::NCCL<at::Half>::Broadcast(getNCCLElements(this, context_));
return true;
} else {
return false;
}
}
};
class NCCLReduceOp final : public NCCLBaseOp {
public:
using NCCLBaseOp::NCCLBaseOp;
bool RunOnDevice() override {
if (InputSize() == 1)
return true;
const auto& ex = getNCCLElements(this, context_);
if (AllInputsAre<float>(this)) {
nccl::NCCL<float>::Reduce(ex);
return true;
} else if (AllInputsAre<at::Half>(this)) {
nccl::NCCL<at::Half>::Reduce(ex);
return true;
} else {
return false;
}
}
};
class NCCLAllGatherOp final : public NCCLBaseOp {
public:
using NCCLBaseOp::NCCLBaseOp;
bool RunOnDevice() override {
if (InputSize() == 1)
return true;
if (AllInputsAre<float>(this)) {
nccl::NCCL<float>::AllGather(getNCCLElements(this, context_));
return true;
} else if (AllInputsAre<at::Half>(this)) {
nccl::NCCL<at::Half>::AllGather(getNCCLElements(this, context_));
return true;
} else {
return false;
}
}
};
class NCCLReduceScatterOp final : public NCCLBaseOp {
public:
using NCCLBaseOp::NCCLBaseOp;
bool RunOnDevice() override {
if (AllInputsAre<float>(this)) {
nccl::NCCL<float>::ReduceScatter(getNCCLElements(this, context_));
return true;
} else if (AllInputsAre<at::Half>(this)) {
nccl::NCCL<at::Half>::ReduceScatter(getNCCLElements(this, context_));
return true;
} else {
return false;
}
}
};
namespace {
std::pair<std::vector<DeviceOption>, std::vector<DeviceOption>> ncclOpDevInfer(
const OperatorDef& def) {
std::vector<DeviceOption> opt;
for (int i = 0; i < def.input().size(); ++i) {
DeviceOption dev;
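    // Device type 1 is CUDA (PROTO_CUDA in caffe2.proto); input i is pinned to GPU i.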
dev.set_device_type(1);
dev.set_device_id(i);
opt.push_back(dev);
}
return std::make_pair(opt, opt);
}
REGISTER_CUDA_OPERATOR(NCCLAllreduce, NCCLAllreduceOp);
OPERATOR_SCHEMA(NCCLAllreduce)
.NumInputs(1, C10_COMPILE_TIME_MAX_GPUS)
.NumOutputs(1, C10_COMPILE_TIME_MAX_GPUS)
.CostInferenceFunction(NCCLAllreduceOp::CostInference)
.TensorInferenceFunction(NCCLAllreduceOp::ShapeInference)
.IdenticalTypeAndShape()
.InputsCanCrossDevices()
.AllowOneToOneInplace()
.DeviceInferenceFunction(ncclOpDevInfer);
SHOULD_NOT_DO_GRADIENT(NCCLAllreduce);
REGISTER_CUDA_OPERATOR(NCCLBroadcast, NCCLBroadcastOp);
OPERATOR_SCHEMA(NCCLBroadcast)
.NumInputs(1, C10_COMPILE_TIME_MAX_GPUS)
.NumOutputs(1, C10_COMPILE_TIME_MAX_GPUS)
.IdenticalTypeAndShape()
.InputsCanCrossDevices()
.EnforceOneToOneInplace()
.DeviceInferenceFunction(ncclOpDevInfer);
SHOULD_NOT_DO_GRADIENT(NCCLBroadcast);
REGISTER_CUDA_OPERATOR(NCCLReduce, NCCLReduceOp);
OPERATOR_SCHEMA(NCCLReduce)
.NumInputs(1, C10_COMPILE_TIME_MAX_GPUS)
.NumOutputs(1)
.IdenticalTypeAndShapeOfInput(0)
.InputsCanCrossDevices()
.AllowInplace([](int /*in*/, int out) -> bool { return (out == 0); })
.DeviceInferenceFunction(ncclOpDevInfer);
SHOULD_NOT_DO_GRADIENT(NCCLReduce);
REGISTER_CUDA_OPERATOR(NCCLAllGather, NCCLAllGatherOp);
OPERATOR_SCHEMA(NCCLAllGather)
.NumInputs(1, C10_COMPILE_TIME_MAX_GPUS)
.NumOutputs(1, C10_COMPILE_TIME_MAX_GPUS)
.InputsCanCrossDevices()
.DeviceInferenceFunction(ncclOpDevInfer);
SHOULD_NOT_DO_GRADIENT(NCCLAllGather);
REGISTER_CUDA_OPERATOR(NCCLReduceScatter, NCCLReduceScatterOp);
OPERATOR_SCHEMA(NCCLReduceScatter)
.NumInputs(1, C10_COMPILE_TIME_MAX_GPUS)
.NumOutputs(1, C10_COMPILE_TIME_MAX_GPUS)
.InputsCanCrossDevices()
.DeviceInferenceFunction(ncclOpDevInfer);
SHOULD_NOT_DO_GRADIENT(NCCLReduceScatter);
} // namespace
} // namespace caffe2
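For reference, the operators deleted above were driven from the caffe2 Python front end, and the ncclOpDevInfer hook registered above pins input/output i to CUDA device i, which is why callers fed each blob from a matching GPU. Below is a minimal sketch of an in-place NCCLAllreduce across two GPUs, distilled from the test file that follows; the blob names and tensor size are illustrative, and it assumes a caffe2 build that still ships caffe2/contrib/nccl plus at least two CUDA devices.

import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import core, dyndep, workspace

# Load the (now removed) NCCL operator library.
dyndep.InitOpsLibrary("@/caffe2/caffe2/contrib/nccl:nccl_ops")

def gpu_device(i):
    # DeviceOption pinning a blob or op to CUDA device i.
    opt = caffe2_pb2.DeviceOption()
    opt.device_type = workspace.GpuDeviceType
    opt.device_id = i
    return opt

inputs = ["x_0", "x_1"]
for i, name in enumerate(inputs):
    # Each input lives on its own GPU, matching ncclOpDevInfer above.
    workspace.FeedBlob(name, np.random.randn(16).astype(np.float32),
                       device_option=gpu_device(i))

# NCCLAllreduce is registered for CUDA only, so the op itself also carries a
# CUDA device option; running it in place leaves the elementwise sum in every x_i.
op = core.CreateOperator("NCCLAllreduce", inputs, inputs,
                         device_option=gpu_device(0))
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("x_0"))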

View File

@ -1,192 +0,0 @@
import unittest
import hypothesis.strategies as st
from hypothesis import given, assume
import numpy as np
import time
import os
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace, muji, dyndep
import caffe2.python.hypothesis_test_util as hu
np.random.seed(1)
dyndep.InitOpsLibrary('@/caffe2/caffe2/contrib/nccl:nccl_ops')
def gpu_device(i):
device_option = caffe2_pb2.DeviceOption()
device_option.device_type = workspace.GpuDeviceType
device_option.device_id = i
return device_option
def benchmark(ws, net, warmups=5, iters=100):
for _ in range(warmups):
ws.run(net)
plan = core.Plan("plan")
plan.AddStep(core.ExecutionStep("test-step", net, iters))
before = time.time()
ws.run(plan)
after = time.time()
print("Timing network, time taken per-iteration: {:.6f}ms".format((
after - before) / float(iters) * 1000.0))
return after - before
@unittest.skipIf(not workspace.has_cuda_support, "NCCL only on CUDA GPU")
class NCCLOpsTest(hu.HypothesisTestCase):
@given(n=st.integers(min_value=2, max_value=workspace.NumGpuDevices()),
m=st.integers(min_value=1, max_value=1000),
in_place=st.booleans())
def test_nccl_allreduce(self, n, m, in_place):
xs = [np.random.randn(m).astype(np.float32) for i in range(n)]
inputs = [str("x_{}".format(i)) for i in range(n)]
prefix = "" if in_place else "o"
outputs = [str("{}x_{}".format(prefix, i)) for i in range(n)]
op = core.CreateOperator("NCCLAllreduce", inputs, outputs)
input_device_options = {name: gpu_device(i) for i, name in enumerate(inputs)}
def allreduce(*args):
assert len(args) == n
output = np.sum(args, axis=0)
return [output for _ in range(n)]
outputs = self.assertReferenceChecks(
hu.gpu_do, op, [xs[i] for i, _ in enumerate(inputs)],
allreduce, input_device_options)
for output in outputs:
np.testing.assert_array_equal(outputs[0], output)
self.assertEqual(outputs[0].tobytes(), output.tobytes())
@given(n=st.integers(min_value=2, max_value=workspace.NumGpuDevices()),
m=st.integers(min_value=1, max_value=1000),
root=st.integers(min_value=0,
max_value=workspace.NumGpuDevices() - 1))
def test_nccl_broadcast(self, n, m, root):
assume(root < n)
xs = [np.random.randn(m).astype(np.float32) for i in range(n)]
inputs = [str("x_{}".format(i)) for i in range(n)]
op = core.CreateOperator("NCCLBroadcast", inputs, inputs, root=root)
input_device_options = {name: gpu_device(i) for i, name in enumerate(inputs)}
def broadcast(*args):
assert len(args) == n
return [args[root] for _ in range(n)]
self.assertReferenceChecks(
hu.gpu_do, op, [xs[i] for i, _ in enumerate(inputs)],
broadcast, input_device_options)
@given(n=st.integers(min_value=2, max_value=workspace.NumGpuDevices()),
m=st.integers(min_value=1, max_value=1000),
# NCCL Reduce seems to deadlock for non-zero roots.
root=st.integers(min_value=0, max_value=0),
in_place=st.booleans())
def test_nccl_reduce(self, n, m, root, in_place):
assume(in_place is False or root == 0)
xs = [np.random.randn(m).astype(np.float32) for i in range(n)]
inputs = [str("x_{}".format(i)) for i in range(n)]
op = core.CreateOperator(
"NCCLReduce", inputs,
inputs[root] if in_place else b"o", root=root)
input_device_options = {name: gpu_device(i) for i, name in enumerate(inputs)}
def reduce(*args):
assert len(args) == n
return [np.sum(args, axis=0)]
self.assertReferenceChecks(
hu.gpu_do, op, [xs[i] for i, _ in enumerate(inputs)],
reduce, input_device_options)
@given(n=st.integers(min_value=2, max_value=workspace.NumGpuDevices()),
m=st.integers(min_value=1, max_value=1000))
def test_nccl_allgather(self, n, m):
xs = [np.random.randn(m).astype(np.float32) for i in range(n)]
inputs = [str("x_{}".format(i)) for i in range(n)]
outputs = [str("o_{}".format(i)) for i in range(n)]
op = core.CreateOperator("NCCLAllGather", inputs, outputs)
input_device_options = {name: gpu_device(i) for i, name in enumerate(inputs)}
def allgather(*args):
assert len(args) == n
return [np.stack(args, axis=0) for _ in range(n)]
outputs = self.assertReferenceChecks(
hu.gpu_do, op, [xs[i] for i, _ in enumerate(inputs)],
allgather, input_device_options)
for output in outputs:
np.testing.assert_array_equal(outputs[0], output)
self.assertEqual(outputs[0].tobytes(), output.tobytes())
@given(n=st.integers(min_value=2, max_value=workspace.NumGpuDevices()),
m=st.integers(min_value=1, max_value=1000))
def test_nccl_reduce_scatter(self, n, m):
xs = [np.random.randn(n, m).astype(np.float32) for i in range(n)]
inputs = [str("x_{}".format(i)) for i in range(n)]
outputs = [str("o_{}".format(i)) for i in range(n)]
op = core.CreateOperator("NCCLReduceScatter", inputs, outputs)
input_device_options = {name: gpu_device(i) for i, name in enumerate(inputs)}
def reduce_scatter(*args):
assert len(args) == n
reduced = sum(args)
assert len(reduced.shape) > 1
ref = [reduced[i, :] for i in range(n)]
return ref
self.assertReferenceChecks(
hu.gpu_do, op, [xs[i] for i, _ in enumerate(inputs)],
reduce_scatter, input_device_options)
@given(n=st.integers(min_value=2, max_value=workspace.NumGpuDevices()),
m=st.integers(min_value=100000, max_value=100000),
iters=st.integers(min_value=1, max_value=100),
net_type=st.sampled_from(["dag", "async_dag", "simple"]))
def _test_nccl_sync(self, n, m, iters, net_type):
inputs = [str("x_{}".format(i)) for i in range(n)]
extra_inputs = [str("xe_{}".format(i)) for i in range(n)]
net = core.Net("asdf")
net.Proto().type = net_type
net.Proto().num_workers = n
for i in range(n):
net.ConstantFill([], inputs[i], shape=[m], value=0.0,
device_option=gpu_device(i))
net.ConstantFill([], extra_inputs[i], shape=[m], value=1.0,
device_option=gpu_device(i))
for _ in range(iters):
net.Sum([inputs[i], extra_inputs[i]], [inputs[i]],
device_option=gpu_device(i))
net.NCCLReduce(inputs, [inputs[0]], device_option=gpu_device(0))
self.ws.run(net)
np.testing.assert_array_equal(
self.ws.blobs[inputs[0]].fetch(),
np.full(shape=(m,), fill_value=iters * n, dtype=np.float32))
@unittest.skipIf(not os.environ.get("CAFFE2_BENCHMARK"), "Benchmark")
def test_timings(self):
for n in range(2, workspace.NumGpuDevices()):
for in_place in [False, True]:
xs = [np.random.randn(int(1e7)).astype(np.float32)
for i in range(n)]
inputs = [str("x_{}".format(i)) for i in range(n)]
prefix = "" if in_place else "o"
outputs = [str("{}x_{}".format(prefix, i)) for i in range(n)]
net = core.Net("test")
net.NCCLAllreduce(inputs, outputs)
net.RunAllOnGPU()
for i in range(n):
self.ws.create_blob(inputs[i]).feed(xs[i], gpu_device(i))
self.ws.run(net)
net_time = benchmark(self.ws, net)
vanilla = core.Net("vanilla")
muji.Allreduce(vanilla, inputs)
vanilla_time = benchmark(self.ws, vanilla)
print("Speedup for NCCL: {:.2f}".format(
vanilla_time / net_time))

View File

@ -1,352 +0,0 @@
#include "caffe2/core/common.h"
#ifdef CAFFE2_USE_MKL
#include <mkl.h>
#endif
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_pool_op_base.h"
#include "caffe2/operators/leaky_relu_op.h"
#include "caffe2/utils/cpuid.h"
#include "caffe2/utils/math.h"
#include "nnpack.h"
C10_DEFINE_int(
caffe2_nnpack_num_threads,
1,
"The number of nnpack pthreadpool threads.");
C10_DEFINE_bool(
caffe2_nnpack_use_mkl_num_threads,
true,
"If MKL is built, this sets nnpack to use the same number of threads as "
"MKL does. This overrides caffe2_nnpack_num_threads if set.");
namespace caffe2 {
////////////////////////////////////////////////////////////////////////////////
// Helper Functions
////////////////////////////////////////////////////////////////////////////////
namespace {
bool has_nnpack() {
// nnp_initialize is a noop after the first call so it's safe to invoke it
// repeatedly
auto nnpack_status = nnp_initialize();
return nnpack_status == nnp_status_success;
}
nnp_convolution_algorithm get_nnp_convolution_algorithm(
const std::string& algo) {
if (algo == "AUTO") {
return nnp_convolution_algorithm_auto;
}
if (algo == "WINOGRAD") {
return nnp_convolution_algorithm_wt8x8;
}
if (algo == "FT16") {
return nnp_convolution_algorithm_ft16x16;
}
if (algo == "FT8") {
return nnp_convolution_algorithm_ft8x8;
}
return nnp_convolution_algorithm_auto;
}
nnp_convolution_transform_strategy get_nnp_convolution_transform_strategy(
const std::string& kts) {
if (kts == "BLOCK") {
return nnp_convolution_transform_strategy_block_based;
}
if (kts == "TUPLE") {
return nnp_convolution_transform_strategy_tuple_based;
}
return nnp_convolution_transform_strategy_block_based;
}
////////////////////////////////////////////////////////////////////////////////
// Thread Pool
////////////////////////////////////////////////////////////////////////////////
static pthreadpool_t nnpack_threadpool_ = nullptr;
pthreadpool_t nnpack_threadpool() {
if (nnpack_threadpool_ == nullptr) {
enum nnp_status nnpack_status = nnp_initialize();
CAFFE_ENFORCE(
nnpack_status == nnp_status_success, "NNPack is not supported here!");
int num_threads = FLAGS_caffe2_nnpack_num_threads;
if (FLAGS_caffe2_nnpack_use_mkl_num_threads) {
#ifdef CAFFE2_USE_MKL
num_threads = mkl_get_max_threads();
#else
VLOG(1) << "I am asked to use MKL num of threads for NNPACK but this "
"Caffe2 is not built with MKL. Skipping.";
#endif
}
nnpack_threadpool_ = pthreadpool_create(num_threads);
}
return nnpack_threadpool_;
}
} // namespace
////////////////////////////////////////////////////////////////////////////////
// NNPACK Ops
////////////////////////////////////////////////////////////////////////////////
class NNPACKConvOp final : public ConvPoolOpBase<CPUContext> {
public:
NNPACKConvOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<CPUContext>(operator_def, ws),
algo_(get_nnp_convolution_algorithm(
OperatorBase::GetSingleArgument<std::string>("algo", "AUTO"))),
kts_(get_nnp_convolution_transform_strategy(
OperatorBase::GetSingleArgument<std::string>("kts", "TUPLE"))) {
OPERATOR_NEEDS_FEATURE(
this->order_ == StorageOrder::NCHW,
"NNPack only supports NCHW order. Please consider adding "
"TransposeOp with axes=[0, 3, 1, 2] before NNPack Conv.");
OPERATOR_NEEDS_FEATURE(
dilation_h() == 1 && dilation_w() == 1,
"The NNPack convolution does not support dilation yet.");
// NNPACK can be built with avx2 support only and might not be able to run
// on a given machine.
OPERATOR_NEEDS_FEATURE(has_nnpack(), "NNPack can't run here. No AVX2?");
}
bool RunOnDeviceWithOrderNCHW() override {
auto& X = Input(0);
auto& filter = Input(1);
auto& bias = Input(2);
auto* Y = Output(0);
const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
const int M = filter.dim32(0);
CAFFE_ENFORCE(X.dim() == 4, "Input dim should be 4");
CAFFE_ENFORCE_EQ(filter.dim(), 4, "Filter dim should be 4");
CAFFE_ENFORCE(C % this->group_ == 0, "");
CAFFE_ENFORCE(M % this->group_ == 0, "");
CAFFE_ENFORCE(filter.dim32(1) == C / this->group_, "");
CAFFE_ENFORCE(filter.dim32(2) == this->kernel_h(), "");
CAFFE_ENFORCE(filter.dim32(3) == this->kernel_w(), "");
CAFFE_ENFORCE(bias.numel() == M, "");
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
const int oH = Y->dim32(2), oW = Y->dim32(3);
if (N > 1) {
CAFFE_ENFORCE_EQ(
this->stride_h(),
1,
"NNPack only supports stride = 1 when doing batch feedforward");
CAFFE_ENFORCE_EQ(
this->stride_w(),
1,
"NNPack only supports stride = 1 when doing batch feedforward");
}
std::vector<int> pads(
{this->pad_t(), this->pad_b(), this->pad_l(), this->pad_r()});
std::vector<int> stride({this->stride_h(), this->stride_w()});
const nnp_size input_size = {.width = static_cast<size_t>(X.dim32(3)),
.height = static_cast<size_t>(X.dim32(2))};
// filter is MCHW
const nnp_size kernel_size = {
.width = static_cast<size_t>(filter.dim32(3)),
.height = static_cast<size_t>(filter.dim32(2))};
// pad is tblr
const nnp_padding padding = {.top = static_cast<size_t>(pads[0]),
.right = static_cast<size_t>(pads[3]),
.bottom = static_cast<size_t>(pads[1]),
.left = static_cast<size_t>(pads[2])};
const nnp_size output_subsample = {
.width = static_cast<size_t>(stride[1]),
.height = static_cast<size_t>(stride[0])};
if (N == 1) {
VLOG(1) << "Running inference mode";
for (auto g = 0; g < group_; ++g) {
const auto status = nnp_convolution_inference(
algo_,
kts_,
C / group_,
M / group_,
input_size,
padding,
kernel_size,
output_subsample,
X.template data<float>() + g * H * W * (C / group_),
filter.template data<float>() + filter.numel() / group_ * g,
bias.template data<float>() + bias.numel() / group_ * g,
Y->template mutable_data<float>() + g * oH * oW * (M / group_),
nnpack_threadpool(),
nullptr);
CAFFE_ENFORCE(nnp_status_success == status, "");
}
} else {
VLOG(1) << "Running batched mode";
for (auto g = 0; g < group_; ++g) {
const auto status = nnp_convolution_output(
algo_,
N,
C / group_,
M / group_,
input_size,
padding,
kernel_size,
X.template data<float>() + g * H * W * (C / group_),
filter.template data<float>() + filter.numel() / group_ * g,
bias.template data<float>() + bias.numel() / group_ * g,
Y->template mutable_data<float>() + g * oH * oW * (M / group_),
nnpack_threadpool(),
nullptr);
CAFFE_ENFORCE(nnp_status_success == status, "");
}
}
return true;
}
private:
const nnp_convolution_algorithm algo_;
const nnp_convolution_transform_strategy kts_;
};
class NNPACKMaxPoolOp final : public ConvPoolOpBase<CPUContext> {
public:
NNPACKMaxPoolOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<CPUContext>(operator_def, ws) {
OPERATOR_NEEDS_FEATURE(
this->order_ == StorageOrder::NCHW,
"NNPack only supports NCHW order. Please consider add "
"TransposeOp with axes=[0, 3, 1, 2] before NNPack Conv.");
OPERATOR_NEEDS_FEATURE(
this->kernel_h() == 2, "NNPack only supports MaxPool kernel size 2*2!");
OPERATOR_NEEDS_FEATURE(
this->kernel_w() == 2, "NNPack only supports MaxPool kernel size 2*2!");
OPERATOR_NEEDS_FEATURE(
this->stride_h() == 2, "NNPack only supports MaxPool stride size 2*2!");
OPERATOR_NEEDS_FEATURE(
this->stride_w() == 2, "NNPack only supports MaxPool stride size 2*2!");
OPERATOR_NEEDS_FEATURE(
this->pad_t() == 0,
"NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
OPERATOR_NEEDS_FEATURE(
this->pad_l() == 0,
"NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
OPERATOR_NEEDS_FEATURE(
this->pad_r() == 0,
"NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
OPERATOR_NEEDS_FEATURE(
this->pad_b() == 0,
"NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
// NNPACK can be built with avx2 support only and might not be able to run
// on a given machine.
OPERATOR_NEEDS_FEATURE(has_nnpack(), "NNPack can't run here. No AVX2?");
}
bool RunOnDeviceWithOrderNCHW() override {
auto& X = Input(0);
auto* Y = Output(0);
CAFFE_ENFORCE(X.dim() == 4, "Input dim should be 4");
const int H = X.dim32(2), W = X.dim32(3);
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, X.dim32(1));
std::vector<int> pads(
{this->pad_t(), this->pad_b(), this->pad_l(), this->pad_r()});
std::vector<int> stride({this->stride_h(), this->stride_w()});
std::vector<int> pooling({this->kernel_h(), this->kernel_w()});
// Input X is in NCHW order
const size_t batch_size = X.dim32(0);
const size_t input_channels = X.dim32(1);
const nnp_size input_size = {.width = static_cast<size_t>(X.dim32(3)),
.height = static_cast<size_t>(X.dim32(2))};
// pooling kernel
const nnp_size pooling_size = {.width = static_cast<size_t>(pooling[1]),
.height = static_cast<size_t>(pooling[0])};
// pad is tblr
const nnp_padding padding = {.top = static_cast<size_t>(pads[0]),
.right = static_cast<size_t>(pads[3]),
.bottom = static_cast<size_t>(pads[1]),
.left = static_cast<size_t>(pads[2])};
const nnp_size pooling_stride = {.width = static_cast<size_t>(stride[1]),
.height = static_cast<size_t>(stride[0])};
const auto status = nnp_max_pooling_output(
batch_size,
input_channels,
input_size,
padding,
pooling_size,
pooling_stride,
X.template data<float>(),
Y->template mutable_data<float>(),
nnpack_threadpool());
CAFFE_ENFORCE(nnp_status_success == status, "");
return true;
}
};
class NNPACKReluOp final : public Operator<CPUContext> {
public:
NNPACKReluOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws) {
// NNPACK can be built with avx2 support only and might not be able to run
// on a given machine.
OPERATOR_NEEDS_FEATURE(has_nnpack(), "NNPack can't run here. No AVX2?");
}
bool RunOnDevice() override {
auto& X = Input(0);
auto* Y = Output(0);
const auto status = nnp_relu_output(
1,
X.numel(),
X.template data<float>(),
Y->template mutable_data<float>(),
0.0,
nnpack_threadpool());
CAFFE_ENFORCE(nnp_status_success == status, "");
return true;
}
};
class NNPACKLeakyReluOp final : public LeakyReluOp<float, CPUContext> {
public:
NNPACKLeakyReluOp(const OperatorDef& operator_def, Workspace* ws)
: LeakyReluOp<float, CPUContext>(operator_def, ws) {
// NNPACK can be built with avx2 support only and might not be able to run
// on a given machine.
OPERATOR_NEEDS_FEATURE(has_nnpack(), "NNPack can't run here. No AVX2?");
}
bool RunOnDevice() override {
auto& X = Input(0);
auto* Y = Output(0);
const auto status = nnp_relu_output(
1,
X.numel(),
X.template data<float>(),
Y->template mutable_data<float>(),
alpha_,
nnpack_threadpool());
CAFFE_ENFORCE(nnp_status_success == status, "");
return true;
}
};
REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv, NNPACK, NNPACKConvOp);
REGISTER_CPU_OPERATOR_WITH_ENGINE(MaxPool, NNPACK, NNPACKMaxPoolOp);
REGISTER_CPU_OPERATOR_WITH_ENGINE(Relu, NNPACK, NNPACKReluOp);
REGISTER_CPU_OPERATOR_WITH_ENGINE(LeakyRelu, NNPACK, NNPACKLeakyReluOp);
} // namespace caffe2
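The NNPACK kernels above are wired into the standard Conv, MaxPool, Relu, and LeakyRelu schemas and are selected through the operator's engine field, with kts choosing the kernel-transform strategy parsed by get_nnp_convolution_transform_strategy. Below is a minimal sketch of a single NNPACK convolution, distilled from the test file that follows; the shapes are illustrative, and it assumes an AVX2-capable CPU and a caffe2 build that still ships the caffe2/contrib/nnpack ops.

import numpy as np
from caffe2.python import core, dyndep, workspace

# Load the (now removed) NNPACK operator library.
dyndep.InitOpsLibrary("@/caffe2/caffe2/contrib/nnpack:nnpack_ops")

# NCHW input, 8 -> 8 channels, 3x3 kernel; NNPACK only supports NCHW here.
X = np.random.rand(1, 8, 16, 16).astype(np.float32) - 0.5
w = np.random.rand(8, 8, 3, 3).astype(np.float32) - 0.5
b = np.random.rand(8).astype(np.float32) - 0.5
workspace.FeedBlob("X", X)
workspace.FeedBlob("w", w)
workspace.FeedBlob("b", b)

# engine="NNPACK" routes this Conv to NNPACKConvOp instead of the default CPU
# implementation; the result should match the default engine within tolerance.
op = core.CreateOperator(
    "Conv", ["X", "w", "b"], ["Y"],
    kernel=3, stride=1, pad=1, order="NCHW",
    engine="NNPACK", kts="TUPLE",
)
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("Y").shape)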

View File

@ -1,237 +0,0 @@
import unittest
import hypothesis.strategies as st
from hypothesis import given, assume, settings
import numpy as np
import time
import os
from caffe2.python import core, dyndep
import caffe2.python.hypothesis_test_util as hu
dyndep.InitOpsLibrary("@/caffe2/caffe2/contrib/nnpack:nnpack_ops")
np.random.seed(1)
def benchmark(ws, net, warmups=5, iters=100):
for _ in range(warmups):
ws.run(net)
plan = core.Plan("plan")
plan.AddStep(core.ExecutionStep("test-step", net, iters))
before = time.time()
ws.run(plan)
after = time.time()
print("Timing network, time taken per-iteration: {:.6f}ms".format((
after - before) / float(iters) * 1000.0))
return after - before
def has_avx2():
import subprocess
try:
subprocess.check_output(["grep", "avx2", "/proc/cpuinfo"])
return True
except subprocess.CalledProcessError:
# grep exits with rc 1 on no matches
return False
@unittest.skipIf(not has_avx2(), "NNPACK requires AVX2")
class NNPackOpsTest(hu.HypothesisTestCase):
@given(stride=st.integers(1, 3),
pad=st.integers(0, 2),
kernel=st.integers(3, 5),
size=st.integers(5, 10),
input_channels=st.integers(1, 8),
batch_size=st.integers(1, 5),
groups=st.integers(1, 2))
def test_convolution_correctness(self, stride, pad, kernel, size,
input_channels,
batch_size, groups):
input_channels *= groups
output_channels = int(input_channels / groups)
assume(input_channels % groups == 0)
assume(output_channels % groups == 0)
assume(output_channels == input_channels / groups)
assume(stride <= kernel)
if stride != 1:
assume(batch_size == 1)
X = np.random.rand(
batch_size, input_channels, size, size).astype(np.float32) - 0.5
w = np.random.rand(
input_channels, output_channels, kernel, kernel).astype(np.float32)\
- 0.5
b = np.random.rand(output_channels).astype(np.float32) - 0.5
order = "NCHW"
outputs = {}
for engine in ["", "NNPACK"]:
op = core.CreateOperator(
"Conv",
["X", "w", "b"],
["Y"],
stride=stride,
kernel=kernel,
pad=pad,
order=order,
kts="TUPLE",
engine=engine,
group=groups,
)
self.ws.create_blob("X").feed(X)
self.ws.create_blob("w").feed(w)
self.ws.create_blob("b").feed(b)
self.ws.run(op)
outputs[engine] = self.ws.blobs["Y"].fetch()
np.testing.assert_allclose(
outputs[""],
outputs["NNPACK"],
atol=1e-4,
rtol=1e-4)
@given(size=st.sampled_from([6, 8]),
input_channels=st.integers(1, 8),
batch_size=st.integers(1, 5))
def test_max_pool_correctness(self, size, input_channels, batch_size):
X = np.random.rand(
batch_size, input_channels, size, size).astype(np.float32) - 0.5
order = "NCHW"
outputs = {}
# only 2 * 2 stride and 2 * 2 pool is supported in NNPack now
stride = 2
kernel = 2
# The pooling strategy of NNPack is different from caffe2 pooling
pad = 0
for engine in ["", "NNPACK"]:
op = core.CreateOperator(
"MaxPool",
["X"],
["Y"],
stride=stride,
kernel=kernel,
pad=pad,
order=order,
engine=engine,
)
self.ws.create_blob("X").feed(X)
self.ws.run(op)
outputs[engine] = self.ws.blobs["Y"].fetch()
np.testing.assert_allclose(
outputs[""],
outputs["NNPACK"],
atol=1e-4,
rtol=1e-4)
@given(size=st.sampled_from([6, 8]),
input_channels=st.integers(1, 8),
batch_size=st.integers(1, 5))
def test_relu_correctness(self, size, input_channels, batch_size):
X = np.random.rand(
batch_size, input_channels, size, size).astype(np.float32) - 0.5
outputs = {}
for engine in ["", "NNPACK"]:
op = core.CreateOperator(
"Relu",
["X"],
["Y"],
engine=engine,
)
self.ws.create_blob("X").feed(X)
self.ws.run(op)
outputs[engine] = self.ws.blobs["Y"].fetch()
np.testing.assert_allclose(
outputs[""],
outputs["NNPACK"],
atol=1e-4,
rtol=1e-4)
@given(size=st.sampled_from([6, 8]),
input_channels=st.integers(1, 8),
batch_size=st.integers(1, 5),
alpha=st.floats(0, 1))
def test_leaky_relu_correctness(self, size, input_channels, batch_size,
alpha):
X = np.random.rand(
batch_size, input_channels, size, size).astype(np.float32) - 0.5
outputs = {}
for engine in ["", "NNPACK"]:
op = core.CreateOperator(
"LeakyRelu",
["X"],
["Y"],
alpha=alpha,
engine=engine,
)
self.ws.create_blob("X").feed(X)
self.ws.run(op)
outputs[engine] = self.ws.blobs["Y"].fetch()
np.testing.assert_allclose(
outputs[""],
outputs["NNPACK"],
atol=1e-4,
rtol=1e-4)
@settings(deadline=3600)
@unittest.skipIf(not os.environ.get("CAFFE2_BENCHMARK"), "Benchmark")
@given(stride=st.integers(1, 1),
pad=st.integers(0, 2),
kernel=st.sampled_from([3, 5, 7]),
size=st.integers(30, 90),
input_channels=st.sampled_from([3, 64, 256]),
output_channels=st.sampled_from([32, 96, 256]),
batch_size=st.sampled_from([32, 64, 96, 128]))
def test_timings(self, stride, pad, kernel, size,
input_channels, output_channels, batch_size):
assume(stride <= kernel)
X = np.random.rand(
batch_size, input_channels, size, size).astype(np.float32) - 0.5
w = np.random.rand(output_channels, input_channels,
kernel, kernel).astype(np.float32) - 0.5
b = np.random.rand(output_channels).astype(np.float32) - 0.5
order = "NCHW"
times = {}
for engine in ["", "NNPACK"]:
net = core.Net(engine + "_test")
net.Conv(
["X", "W", "b"], "Y",
order=order,
kernel=kernel,
stride=stride,
pad=pad,
kts="TUPLE",
engine=engine,
)
self.ws.create_blob("X").feed(X)
self.ws.create_blob("W").feed(w)
self.ws.create_blob("b").feed(b)
self.ws.run(net)
times[engine] = benchmark(self.ws, net)
print("Speedup for NNPACK: {:.2f}".format(
times[""] / times["NNPACK"]))
@settings(deadline=3600)
@unittest.skipIf(not os.environ.get("CAFFE2_BENCHMARK"), "Benchmark")
@given(size=st.integers(30, 90),
input_channels=st.sampled_from([3, 64, 256]),
batch_size=st.sampled_from([32, 64, 96, 128]))
def test_relu_timings(self, size, input_channels, batch_size):
X = np.random.rand(
batch_size, input_channels, size, size).astype(np.float32) - 0.5
times = {}
for engine in ["", "NNPACK"]:
net = core.Net(engine + "_test")
net.Relu(
["X"],
["Y"],
engine=engine,
)
self.ws.create_blob("X").feed(X)
self.ws.run(net)
times[engine] = benchmark(self.ws, net)
print("Speedup for NNPACK: {:.2f}".format(
times[""] / times["NNPACK"]))

Some files were not shown because too many files have changed in this diff.