Remove NervanaGPU operators from Caffe2 (#12564)

Summary:
Fix #12540
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12564

Reviewed By: orionr

Differential Revision: D10379775

Pulled By: soumith

fbshipit-source-id: a925b116f2687e56bf54465fc02ca2eb1e7c8eb0
Author: mratsim
Date: 2018-10-15 11:02:31 -07:00
Committed by: Facebook Github Bot
Parent: 151b28521a
Commit: a1bbe80e21

11 changed files with 0 additions and 787 deletions

.gitmodules

@@ -13,9 +13,6 @@
[submodule "third_party/googletest"]
path = third_party/googletest
url = https://github.com/google/googletest.git
[submodule "third_party/nervanagpu"]
path = third_party/nervanagpu
url = https://github.com/NervanaSystems/nervanagpu.git
[submodule "third_party/benchmark"]
path = third_party/benchmark
url = https://github.com/google/benchmark.git


@@ -95,7 +95,6 @@ option(USE_MOBILE_OPENGL "Use OpenGL for mobile code" ON)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
option(USE_NCCL "Use NCCL" ON)
option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF)
option(USE_NERVANA_GPU "Use Nervana GPU backend" OFF)
option(USE_NNAPI "Use NNAPI" OFF)
option(USE_NNPACK "Use NNPACK" ON)
option(USE_NUMA "Use NUMA (only available on Linux)" ON)


@@ -1,12 +0,0 @@
if(USE_NERVANA_GPU)
message(STATUS "Include Nervana operators")
set(Caffe2_CONTRIB_NCCL_GPU_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/nervana_c_api.cu"
"${CMAKE_CURRENT_SOURCE_DIR}/nervana_fc_op_gpu.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/nervana_init_gpu.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/nervana_math_gpu.cc"
)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${Caffe2_CONTRIB_NCCL_GPU_SRC})
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
endif()


@@ -1,34 +0,0 @@
#ifndef CAFFE2_FB_NERVANA_INIT_H_
#define CAFFE2_FB_NERVANA_INIT_H_
#include "caffe2/core/init.h"
#include "caffe2/core/flags.h"
#include "nervana_c_api.h"
/**
* A flag that specifies the nervana cubin path.
*/
C10_DECLARE_string(nervana_cubin_path);
namespace caffe2 {
/**
* An empty class to be used in identifying the engine in the math functions.
*/
class NervanaEngine {};
/**
* Returns whether the nervana kernels are loaded or not.
*/
bool NervanaKernelLoaded();
/**
* An initialization function that is run once by caffe2::GlobalInit()
* that initializes the nervana kernels.
*/
bool Caffe2InitializeNervanaKernels();
} // namespace caffe2
#endif // CAFFE2_FB_NERVANA_INIT_H_


@@ -1,419 +0,0 @@
/*
* Copyright 2015 Baidu USA, Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <vector>
#include <string>
#include <map>
#include <cuda.h>
#include <iostream>
#include <sstream>
#include <mutex>
#include <tuple>
#include "nervana_c_api.h"
std::map<CUdevice, int> nervana_sm_counts_;
std::map<std::string, CUfunction> nervana_kernels_;
std::vector<CUmodule> nervana_modules_;
//for when we need to modify the above data structures
std::mutex nervana_load_kernels_mutex_;
std::mutex nervana_sm_count_mutex_;
extern "C" bool nervana_loadKernels(const char* const base_path_cstr) {
std::lock_guard<std::mutex> lock(nervana_load_kernels_mutex_);
//better would be a vector<string>, but there is a bug in nvcc that prevents this
// (bug report filed) (fixed in 7.5)
std::string names[36] = {
"hgemm_nn_vec_128x128",
"hgemm_nn_128x128",
"hgemm_nt_vec_128x128",
"hgemm_nt_128x128",
"hgemm_tn_vec_128x128",
"hgemm_tn_128x128",
"hgemm_nn_vec_128x64",
"hgemm_nn_128x64",
"hgemm_tn_vec_128x64",
"hgemm_tn_128x64",
"hgemm_nn_vec_128x32",
"hgemm_nn_128x32",
"hgemm_tn_vec_128x32",
"hgemm_tn_128x32",
"hgemm_nn_32x128",
"hgemm_nn_vec_32x128",
"hgemm_nt_32x128",
"hgemm_nt_vec_32x128",
"sgemm_nn_vec_128x128",
"sgemm_nn_128x128",
"sgemm_nt_vec_128x128",
"sgemm_nt_128x128",
"sgemm_tn_vec_128x128",
"sgemm_tn_128x128",
"sgemm_nn_vec_128x64",
"sgemm_nn_128x64",
"sgemm_tn_vec_128x64",
"sgemm_tn_128x64",
"sgemm_nn_vec_128x32",
"sgemm_nn_128x32",
"sgemm_tn_vec_128x32",
"sgemm_tn_128x32",
"sgemm_nn_32x128",
"sgemm_nn_vec_32x128",
"sgemm_nt_32x128",
"sgemm_nt_vec_32x128"
};
std::string base_path(base_path_cstr);
for (auto kernel : names) {
if (nervana_kernels_.count(kernel) > 0)
continue;
CUmodule module;
std::string path = base_path + kernel + std::string(".cubin");
CUresult res = cuModuleLoad(&module, path.c_str());
if (res != CUDA_SUCCESS) {
// std::cerr << "Failed to load: " << kernel << " " << res << std::endl;
return false;
}
nervana_modules_.push_back(module);
CUfunction function;
res = cuModuleGetFunction(&function, module, kernel.c_str());
if (res != CUDA_SUCCESS) {
// std::cerr << "Failed to extract: " << kernel << " " << res << std::endl;
return false;
}
nervana_kernels_.insert(std::make_pair(kernel, function));
}
return true;
}
extern "C" bool nervana_unloadKernels() {
std::lock_guard<std::mutex> lock(nervana_load_kernels_mutex_);
while(nervana_modules_.size() > 0) {
auto module = nervana_modules_.back();
CUresult res = cuModuleUnload(module);
nervana_modules_.pop_back();
if (res != CUDA_SUCCESS)
return false;
}
nervana_kernels_.clear();
return true;
}
extern "C" size_t nervana_randStateSizeBytes() {
return 2048 * 32 * sizeof(int);
}
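// Picks the tile (sizeA x sizeB) and thread count for a GEMM launch. With
// grid < 0 the choice is heuristic; for example m = n = 4096 keeps the full
// 128x128 tile with 256 threads, while a short side such as m = 60 with
// trans "nn" falls back to a 32x128 tile with 128 threads.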
std::tuple<int, int, int> get_grid_dimensions(int grid, int m, int n, int sm_count, const std::string& trans)
{
int sizeA, sizeB, threads;
if (grid >= 0) {
if (grid == 0) {
sizeA = 32;
sizeB = 128;
threads = 128;
} else if (grid == 1) {
sizeA = 128;
sizeB = 32;
threads = 128;
} else if (grid == 2) {
sizeA = 128;
sizeB = 64;
threads = 128;
} else if (grid == 3) {
sizeA = 128;
sizeB = 128;
threads = 256;
}
} else {
int sh = min(m, n);
int size;
if (sh < 384 - 16) {
int sh128 = sh % 128;
if (sh128 > 0 && sh128 < 112) {
if (sh128 > 48 && sh128 <= 64) {
int sh64 = sh / 64;
int wide = max(m, n);
sh64 *= (wide / 128 + (wide % 128 != 0)) / sm_count;
if (sh64 > 1) {
size = 64;
}
else {
size = 32;
}
}
else {
size = 32;
}
}
else {
size = 128;
}
} else {
size = 128;
}
if (m >= n) {
if (trans == "nt") {
size = 128;
}
sizeA = 128;
sizeB = size;
} else {
if (trans == "tn") {
size = 128;
} else if (size == 64) {
//temporary until kernels exist
size = 32;
}
sizeA = size;
sizeB = 128;
}
threads = (sizeA == 128 && sizeB == 128) ? 256 : 128;
}
return std::make_tuple(sizeA, sizeB, threads);
}
extern "C" bool nervana_sgemm(float *A, float *B, float *C,
bool a_t, bool b_t,
int m, int n, int k,
int lda, int ldb, int ldc,
float alpha, float beta,
unsigned int *rand_state,
bool stochastic_round, bool apply_relu,
CUstream stream, int grid
)
{
int sm_count;
{
std::lock_guard<std::mutex> lock(nervana_sm_count_mutex_);
CUdevice device;
CUresult res = cuCtxGetDevice(&device);
if (res != CUDA_SUCCESS) {
return false;
}
auto count = nervana_sm_counts_.find(device);
if (count != nervana_sm_counts_.end()) {
sm_count = count->second;
}
else {
int pi;
res = cuDeviceGetAttribute(&pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device);
if (res != CUDA_SUCCESS) {
return false;
}
sm_count = pi;
nervana_sm_counts_[device] = pi;
}
}
std::string name = "sgemm_";
std::string trans;
trans += a_t ? 't' : 'n';
trans += b_t ? 't' : 'n';
name += trans;
int sizeA, sizeB, threads;
std::tie(sizeA, sizeB, threads) = get_grid_dimensions(grid, m, n, sm_count, trans);
int k_vec = (sizeA == 32 || sizeB == 32) ? 4 : 16;
if ( (trans == "tn" && m % 4 == 0 && n % 4 == 0) ||
(trans == "nn" && k % k_vec == 0 && n % 4 == 0) ||
(trans == "nt" && k % k_vec == 0)) {
name += "_vec";
}
int gridA = m / sizeA + (m % sizeA != 0);
int gridB = n / sizeB + (n % sizeB != 0);
std::stringstream ss;
ss << "_" << sizeA << "x" << sizeB;
name += ss.str();
int flags = 0;
flags |= (stochastic_round << 0);
flags |= (apply_relu << 1);
CUresult res;
if (a_t)
lda *= (8 * sizeof(float));
if (!b_t)
ldb *= (8 * sizeof(float));
int zero = 0;
void *args[17] = {&rand_state, &A, &B, &C, &lda, &ldb, &ldc, &m, &n, &k, &alpha, &beta, &flags,
&zero, &zero, &zero, &zero};
res = cuLaunchKernel(nervana_kernels_[name],
1, gridA, gridB,
threads, 1, 1,
0,
stream, args, NULL);
if (res != CUDA_SUCCESS) {
std::cerr << "Error launching kernel " << name << " " << res << std::endl;
return false;
}
return true;
}
extern "C" bool nervana_hgemm(short *A, short *B, short *C,
bool a_t, bool b_t,
int m, int n, int k,
int lda, int ldb, int ldc,
float alpha, float beta,
unsigned int *rand_state,
bool stochastic_round, bool apply_relu,
CUstream stream, int grid
)
{
int sm_count;
{
std::lock_guard<std::mutex> lock(nervana_sm_count_mutex_);
CUdevice device;
CUresult res = cuCtxGetDevice(&device);
if (res != CUDA_SUCCESS) {
return false;
}
auto count = nervana_sm_counts_.find(device);
if (count != nervana_sm_counts_.end()) {
sm_count = count->second;
}
else {
int pi;
res = cuDeviceGetAttribute(&pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device);
if (res != CUDA_SUCCESS) {
return false;
}
sm_count = pi;
nervana_sm_counts_[device] = pi;
}
}
std::string name = "hgemm_";
std::string trans;
trans += a_t ? 't' : 'n';
trans += b_t ? 't' : 'n';
name += trans;
int sizeA, sizeB, threads;
std::tie(sizeA, sizeB, threads) = get_grid_dimensions(grid, m, n, sm_count, trans);
int k_vec = (sizeA == 32 || sizeB == 32) ? 4 : 16;
if ( (trans == "tn" && m % 4 == 0 && n % 4 == 0) ||
(trans == "nn" && k % k_vec == 0 && n % 4 == 0) ||
(trans == "nt" && k % k_vec == 0)) {
name += "_vec";
}
int gridA = m / sizeA + (m % sizeA != 0);
int gridB = n / sizeB + (n % sizeB != 0);
std::stringstream ss;
ss << "_" << sizeA << "x" << sizeB;
name += ss.str();
int flags = 0;
flags |= (stochastic_round << 0);
flags |= (apply_relu << 1);
CUresult res;
if (a_t)
lda *= (8 * sizeof(short));
if (!b_t)
ldb *= (8 * sizeof(short));
int zero = 0;
void *args[17] = {&rand_state, &A, &B, &C, &lda, &ldb, &ldc, &m, &n, &k, &alpha, &beta, &flags,
&zero, &zero, &zero, &zero};
res = cuLaunchKernel(nervana_kernels_[name],
1, gridA, gridB,
threads, 1, 1,
0,
stream, args, NULL);
if (res != CUDA_SUCCESS) {
std::cerr << "Error launching kernel " << name << " " << res << std::endl;
return false;
}
return true;
}
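// The *_colmajor wrappers below reuse the row-major kernels: a column-major
// C is just C^T stored row-major, and C^T = op(B)^T * op(A)^T, so swapping
// the operand pointers, transpose flags, leading dimensions and the m/n
// extents is all that is needed.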
extern "C" bool nervana_sgemm_colmajor(float *A, float *B, float *C,
bool a_t, bool b_t,
int m, int n, int k,
int lda, int ldb, int ldc,
float alpha, float beta,
unsigned int *rand_state,
bool stochastic_round, bool apply_relu,
CUstream stream, int grid
)
{
return nervana_sgemm(B, A, C,
b_t, a_t,
n, m, k,
ldb, lda, ldc,
alpha, beta,
rand_state, stochastic_round, apply_relu,
stream, grid);
}
extern "C" bool nervana_hgemm_colmajor(short *A, short *B, short *C,
bool a_t, bool b_t,
int m, int n, int k,
int lda, int ldb, int ldc,
float alpha, float beta,
unsigned int *rand_state,
bool stochastic_round, bool apply_relu,
CUstream stream, int grid
)
{
return nervana_hgemm(B, A, C,
b_t, a_t,
n, m, k,
ldb, lda, ldc,
alpha, beta,
rand_state, stochastic_round, apply_relu,
stream, grid);
}


@@ -1,132 +0,0 @@
/*
* Copyright 2015 Baidu USA, Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cuda.h>
#ifdef __cplusplus
extern "C" {
#else
#include <stdbool.h>
#endif
/** Load all the sgemm and hgemm cubins from the given path
* \param [in] base_path path to the kernel cubins
* \return true on success and false if an error was encountered
*/
bool nervana_loadKernels(const char* const base_path);
/** Unload all currently loaded cubins
* \return true on success and false if an error was encountered
*/
bool nervana_unloadKernels();
/** Return the number of bytes required for the random state
* used in stochastic rounding.
* \return bytes required for random state
*/
size_t nervana_randStateSizeBytes();
/** Perform BLAS sgemm on alpha * A * B + beta * C, with the
* additional options of stochastic rounding and applying a
* rectified linear unit (relu) to the result. This routine expects
* all matrices to be in row-major order.
* \param [in] A Pointer to the data for matrix A
* \param [in] B Pointer to the data for matrix B
* \param [in, out] C Pointer to the data for matrix C
* \param [in] m number of rows of C
* \param [in] n number of columns of C
* \param [in] k inner dimension of multiplication
* \param [in] lda leading dimension of two-dimensional array A
* \param [in] ldb leading dimension of two-dimensional array B
* \param [in] ldc leading dimension of two-dimensional array C
* \param [in] alpha scalar used for multiplication
* \param [in] beta scalar used for multiplication
* \param [in, out] rand_state pointer to memory used for random state
* use nervana_randStateSizeBytes to allocate the correct size
* if stochastic_round is false, this can be NULL
* \param [in] stochastic_round true if stochastic rounding should be used
* \param [in] apply_relu true if a relu should be applied to the result
* \param [in] stream The cudaStream on which the kernel should be launched
* \param [in] grid Choose a specific grid configuration: 0=32x128, 1=128x32, 2=128x64, 3=128x128
*/
bool nervana_sgemm(float *A, float *B, float *C,
bool a_t, bool b_t,
int m, int n, int k,
int lda, int ldb, int ldc,
float alpha, float beta,
unsigned int *rand_state,
bool stochastic_round, bool apply_relu,
CUstream stream, int grid=-1
);
/** Perform BLAS hgemm on alpha * A * B + beta * C, with the
* additional options of stochastic rounding and applying a
* rectified linear unit (relu) to the result. This routine expects
* all matrices to be in row-major order.
* \param [in] A Pointer to the data for matrix A
* \param [in] B Pointer to the data for matrix B
* \param [in, out] C Pointer to the data for matrix C
* \param [in] m number of rows of C
* \param [in] n number of columns of C
* \param [in] k inner dimension of multiplication
* \param [in] lda leading dimension of two-dimensional array A
* \param [in] ldb leading dimension of two-dimensional array B
* \param [in] ldc leading dimension of two-dimensional array C
* \param [in] alpha scalar used for multiplication
* \param [in] beta scalar used for multiplication
* \param [in, out] rand_state pointer to memory used for random state
* use nervana_randStateSizeBytes to allocate the correct size
* if stochastic_round is false, this can be NULL
* \param [in] stochastic_round true if stochastic rounding should be used
* \param [in] apply_relu true if a relu should be applied to the result
* \param [in] stream The cudaStream on which the kernel should be launched
* \param [in] grid Choose a specific grid configuration: 0=32x128, 1=128x32, 2=128x64, 3=128x128
*/
bool nervana_hgemm(short *A, short *B, short *C,
bool a_t, bool b_t,
int m, int n, int k,
int lda, int ldb, int ldc,
float alpha, float beta,
unsigned int *rand_state,
bool stochastic_round, bool apply_relu,
CUstream stream, int grid=-1
);
bool nervana_sgemm_colmajor(float *A, float *B, float *C,
bool a_t, bool b_t,
int m, int n, int k,
int lda, int ldb, int ldc,
float alpha, float beta,
unsigned int *rand_state,
bool stochastic_round, bool apply_relu,
CUstream stream, int grid=-1
);
bool nervana_hgemm_colmajor(short *A, short *B, short *C,
bool a_t, bool b_t,
int m, int n, int k,
int lda, int ldb, int ldc,
float alpha, float beta,
unsigned int *rand_state,
bool stochastic_round, bool apply_relu,
CUstream stream, int grid=-1
);
#ifdef __cplusplus
}
#endif
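Since this C header is deleted by the commit, a minimal usage sketch of the row-major API is kept here for reference. It is a sketch only: the cubin directory and matrix shapes are placeholders, stochastic rounding is off (so rand_state stays NULL), and a CUDA context is assumed to be current on the calling thread.

#include <cuda.h>
#include <cuda_runtime.h>
#include "nervana_c_api.h"

// Sketch only: row-major C = A * B via the removed API.
bool sgemm_example() {
  // Trailing slash matters: the loader appends "<kernel>.cubin" to this path.
  if (!nervana_loadKernels("/path/to/cubins/")) return false;

  const int m = 128, n = 256, k = 64;
  float *A = nullptr, *B = nullptr, *C = nullptr;
  cudaMalloc((void**)&A, sizeof(float) * m * k);
  cudaMalloc((void**)&B, sizeof(float) * k * n);
  cudaMalloc((void**)&C, sizeof(float) * m * n);
  // ... fill A and B on the device ...

  bool ok = nervana_sgemm(A, B, C,
                          /*a_t=*/false, /*b_t=*/false,
                          m, n, k,
                          /*lda=*/k, /*ldb=*/n, /*ldc=*/n,
                          /*alpha=*/1.0f, /*beta=*/0.0f,
                          /*rand_state=*/nullptr,
                          /*stochastic_round=*/false, /*apply_relu=*/false,
                          /*stream=*/nullptr);  // default stream, default grid

  cudaFree(A); cudaFree(B); cudaFree(C);
  nervana_unloadKernels();
  return ok;
}

With these shapes and trans "nn", the heuristic in nervana_c_api.cu resolves to the sgemm_nn_vec_128x128 kernel.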


@@ -1,15 +0,0 @@
#include "nervana.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/fully_connected_op.h"
namespace caffe2 {
REGISTER_CUDA_OPERATOR_WITH_ENGINE(
FC,
NERVANA,
FullyConnectedOp<CUDAContext, NervanaEngine>);
REGISTER_CUDA_OPERATOR_WITH_ENGINE(
FCGradient,
NERVANA,
FullyConnectedGradientOp<CUDAContext, NervanaEngine>);
} // namespace caffe2


@@ -1,66 +0,0 @@
#include "nervana.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/flags.h"
#include "caffe2/core/blob.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"
#include "caffe2/operators/fully_connected_op.h"
#include "caffe2/utils/math.h"
#include "common/gtest/gtest_extensions.h"
#include <gtest/gtest.h>
C10_DECLARE_string(caffe_test_root);
namespace caffe2 {
namespace {
static void AddConstInput(const std::vector<int>& shape, const float value,
const string& name, Workspace* ws) {
DeviceOption option;
option.set_device_type(PROTO_CUDA);
CUDAContext context(option);
Blob* blob = ws->CreateBlob(name);
auto* tensor = BlobGetMutableTensor(blob, CUDA);
tensor->Resize(shape);
math::Set<float, CUDAContext>(tensor->size(), value,
tensor->mutable_data<float>(),
&context);
return;
}
} // namespace
TEST(NervanaFullyConnectedTest, Test) {
if (!NervanaKernelLoaded()) {
SKIP() << "Nervana kernels are not loaded. Skipping test.";
}
Workspace ws;
OperatorDef def;
def.set_name("test");
def.set_type("FC");
def.add_input("X");
def.add_input("W");
def.add_input("B");
def.add_output("Y");
def.mutable_device_option()->set_device_type(PROTO_CUDA);
def.set_engine("NERVANA");
AddConstInput(std::vector<int>{5, 10}, 1., "X", &ws);
AddConstInput(std::vector<int>{6, 10}, 1., "W", &ws);
AddConstInput(std::vector<int>{6}, 0.1, "B", &ws);
unique_ptr<OperatorBase> op(
new FullyConnectedOp<CUDAContext, NervanaEngine>(def, &ws));
EXPECT_NE(nullptr, op.get());
EXPECT_TRUE(op->Run());
Blob* Yblob = ws.GetBlob("Y");
EXPECT_NE(nullptr, Yblob);
auto& Y = Yblob->Get<Tensor>();
Tensor Y_cpu(Y, CPU);
EXPECT_EQ(Y.size(), 5 * 6);
for (int i = 0; i < Y.size(); ++i) {
CHECK_LT(Y_cpu.data<float>()[i], 10.11);
CHECK_GT(Y_cpu.data<float>()[i], 10.09);
}
}
} // namespace caffe2


@@ -1,48 +0,0 @@
#include "caffe2/core/init.h"
#include "caffe2/core/flags.h"
#include "nervana_c_api.h"
C10_DEFINE_string(
nervana_cubin_path,
"/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/cubin/",
"The cubin path for nervana kernels. Currently defaulted "
"to the internal fb deployment path.");
namespace caffe2 {
namespace {
static bool g_nervana_kernel_loaded = false;
} // namespace
bool NervanaKernelLoaded() { return g_nervana_kernel_loaded; }
bool Caffe2InitializeNervanaKernels(int*, char***) {
// If we do not specify the nervana cubin path, we will simply return.
if (FLAGS_nervana_cubin_path.size() == 0) {
VLOG(1) << "Nervana cubin loading skipped.";
return true;
}
g_nervana_kernel_loaded =
nervana_loadKernels(FLAGS_nervana_cubin_path.c_str());
if (g_nervana_kernel_loaded) {
VLOG(1) << "Loaded nervana kernels from path "
<< FLAGS_nervana_cubin_path;
} else {
// Since this is not a critical error we will just vlog it.
VLOG(1) << "Cannot load nervana gpu kernels from path "
<< FLAGS_nervana_cubin_path
<< ", will disable Caffe2 nervana engines.";
}
// We will always return true for this initialization, because the loading
// result is kept and accessible via NervanaKernelLoaded(). This allows us
// to register an init function but not forcing the user to have to install
// nervana kernels, delaying the failure to the first time a nervana kernel
// is actually called.
return true;
}
REGISTER_CAFFE2_INIT_FUNCTION(Caffe2InitializeNervanaKernels,
&Caffe2InitializeNervanaKernels,
"Initialize nervana kernels for caffe2.");
} // namespace caffe2
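For reference, a hedged sketch of how this init hook was driven: the cubin path came in as a command-line flag consumed by caffe2::GlobalInit, and callers were expected to check NervanaKernelLoaded() before relying on the NERVANA engines. The binary name and path below are placeholders.

#include "caffe2/core/init.h"
#include "caffe2/core/logging.h"
#include "nervana.h"

// Sketch only: launched e.g. as
//   ./some_caffe2_binary --nervana_cubin_path=/path/to/cubins/
int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);  // runs Caffe2InitializeNervanaKernels
  if (!caffe2::NervanaKernelLoaded()) {
    LOG(WARNING) << "Nervana cubins not loaded; NERVANA engines unavailable.";
  }
  // ... build and run nets, optionally with engine "NERVANA" ...
  return 0;
}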


@@ -1,53 +0,0 @@
#include "nervana.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
namespace math {
// Caffe2 gemm provides a simpler interface to the gemm functions, with the
// limitation that the data has to be contiguous in memory.
template <>
void Gemm<float, CUDAContext, NervanaEngine>(
const CBLAS_TRANSPOSE TransA,
const CBLAS_TRANSPOSE TransB,
const int M,
const int N,
const int K,
const float alpha,
const float* A,
const float* B,
const float beta,
float* C,
CUDAContext* context,
TensorProto::DataType /*math_type*/) {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (TransA == CblasNoTrans) ? K : M;
int ldb = (TransB == CblasNoTrans) ? N : K;
bool a_t = (TransA == CblasTrans);
bool b_t = (TransB == CblasTrans);
CAFFE_ENFORCE(nervana_sgemm(
const_cast<float*>(A),
const_cast<float*>(B),
C,
a_t,
b_t,
M,
N,
K,
lda,
ldb,
N,
alpha,
beta,
nullptr,
false,
false,
context->cuda_stream()));
}
} // namespace math
} // namespace caffe2
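A hypothetical call site for the specialization above, assuming math_type keeps its usual FLOAT default in caffe2/utils/math.h; the device pointers and extents are placeholders. The NervanaEngine tag is what routes the call here instead of the default cuBLAS path.

#include "caffe2/core/context_gpu.h"
#include "caffe2/utils/math.h"
#include "nervana.h"

// Sketch only: row-major GEMM dispatched through the Nervana specialization.
void gemm_via_nervana(const float* d_A, const float* d_B, float* d_C,
                      int M, int N, int K, caffe2::CUDAContext* ctx) {
  caffe2::math::Gemm<float, caffe2::CUDAContext, caffe2::NervanaEngine>(
      CblasNoTrans, CblasNoTrans, M, N, K,
      1.0f, d_A, d_B, 0.0f, d_C, ctx);
}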


@@ -105,10 +105,6 @@ function (caffe2_print_configuration_summary)
if(${USE_NCCL})
message(STATUS " USE_SYSTEM_NCCL : ${USE_SYSTEM_NCCL}")
endif()
message(STATUS " USE_NERVANA_GPU : ${USE_NERVANA_GPU}")
if(${USE_NERVANA_GPU})
message(STATUS " NERVANA_GPU version : ${NERVANA_GPU_VERSION}")
endif()
message(STATUS " USE_NNPACK : ${USE_NNPACK}")
message(STATUS " USE_NUMPY : ${USE_NUMPY}")
message(STATUS " USE_OBSERVERS : ${USE_OBSERVERS}")