mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Remove NervanaGPU operators from Caffe2 (#12564)
Summary: Fix #12540 Pull Request resolved: https://github.com/pytorch/pytorch/pull/12564 Reviewed By: orionr Differential Revision: D10379775 Pulled By: soumith fbshipit-source-id: a925b116f2687e56bf54465fc02ca2eb1e7c8eb0
This commit is contained in:
committed by
Facebook Github Bot
parent
151b28521a
commit
a1bbe80e21
3
.gitmodules
vendored
3
.gitmodules
vendored
@ -13,9 +13,6 @@
|
||||
[submodule "third_party/googletest"]
|
||||
path = third_party/googletest
|
||||
url = https://github.com/google/googletest.git
|
||||
[submodule "third_party/nervanagpu"]
|
||||
path = third_party/nervanagpu
|
||||
url = https://github.com/NervanaSystems/nervanagpu.git
|
||||
[submodule "third_party/benchmark"]
|
||||
path = third_party/benchmark
|
||||
url = https://github.com/google/benchmark.git
|
||||
|
@ -95,7 +95,6 @@ option(USE_MOBILE_OPENGL "Use OpenGL for mobile code" ON)
|
||||
option(USE_NATIVE_ARCH "Use -march=native" OFF)
|
||||
option(USE_NCCL "Use NCCL" ON)
|
||||
option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF)
|
||||
option(USE_NERVANA_GPU "Use Nervana GPU backend" OFF)
|
||||
option(USE_NNAPI "Use NNAPI" OFF)
|
||||
option(USE_NNPACK "Use NNPACK" ON)
|
||||
option(USE_NUMA "Use NUMA (only available on Linux)" ON)
|
||||
|
@ -1,12 +0,0 @@
|
||||
# Add the Nervana GPU operator sources to the Caffe2 GPU source list when the
# USE_NERVANA_GPU option is enabled.
if(USE_NERVANA_GPU)
  message(STATUS "Include Nervana operators")

  # Sources that make up the Nervana contrib module. The variable keeps its
  # historical (NCCL-derived) name.
  set(Caffe2_CONTRIB_NCCL_GPU_SRC
      "${CMAKE_CURRENT_SOURCE_DIR}/nervana_c_api.cu"
      "${CMAKE_CURRENT_SOURCE_DIR}/nervana_fc_op_gpu.cc"
      "${CMAKE_CURRENT_SOURCE_DIR}/nervana_init_gpu.cc"
      "${CMAKE_CURRENT_SOURCE_DIR}/nervana_math_gpu.cc"
  )

  # Extend the list locally, then publish the updated list to the parent
  # directory scope.
  list(APPEND Caffe2_GPU_SRCS ${Caffe2_CONTRIB_NCCL_GPU_SRC})
  set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
endif()
|
@ -1,34 +0,0 @@
|
||||
#ifndef CAFFE2_FB_NERVANA_INIT_H_
|
||||
#define CAFFE2_FB_NERVANA_INIT_H_
|
||||
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/flags.h"
|
||||
|
||||
#include "nervana_c_api.h"
|
||||
|
||||
/**
|
||||
* A flag that specifies the nervana cubin path.
|
||||
*/
|
||||
C10_DECLARE_string(nervana_cubin_path);
|
||||
|
||||
namespace caffe2 {

/**
 * An empty tag class used to identify the Nervana engine when selecting
 * math function and operator specializations.
 */
class NervanaEngine {};

/**
 * Returns whether the nervana kernels are loaded or not.
 */
bool NervanaKernelLoaded();

/**
 * An initialization function that is run once by caffe2::GlobalInit()
 * that initializes the nervana kernels.
 *
 * The two parameters are unused by the implementation and are left unnamed.
 * The signature is spelled out here so that this declaration matches the
 * function that is actually defined and registered as an init function; a
 * parameterless declaration would name an overload that is never defined.
 */
bool Caffe2InitializeNervanaKernels(int*, char***);

} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_FB_NERVANA_INIT_H_
|
@ -1,419 +0,0 @@
|
||||
/*
|
||||
* Copyright 2015 Baidu USA, Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <cuda.h>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <mutex>
|
||||
#include <tuple>
|
||||
#include "nervana_c_api.h"
|
||||
|
||||
// Per-device cache of the multiprocessor count, filled lazily by the gemm
// entry points while they hold nervana_sm_count_mutex_.
std::map<CUdevice, int> nervana_sm_counts_;
|
||||
// Kernel name -> function handle, populated by nervana_loadKernels() and
// emptied by nervana_unloadKernels().
std::map<std::string, CUfunction> nervana_kernels_;
|
||||
// Every module loaded by nervana_loadKernels(), kept so that
// nervana_unloadKernels() can unload them again.
std::vector<CUmodule> nervana_modules_;
|
||||
|
||||
// Mutexes taken when the data structures above are modified:
// nervana_load_kernels_mutex_ is held by nervana_loadKernels() and
// nervana_unloadKernels(); nervana_sm_count_mutex_ is held while
// nervana_sm_counts_ is read or updated.
|
||||
std::mutex nervana_load_kernels_mutex_;
|
||||
std::mutex nervana_sm_count_mutex_;
|
||||
|
||||
extern "C" bool nervana_loadKernels(const char* const base_path_cstr) {
|
||||
std::lock_guard<std::mutex> lock(nervana_load_kernels_mutex_);
|
||||
|
||||
//better would be a vector<string>, but there is a bug in nvcc that prevents this
|
||||
// (bug report filed) (fixed in 7.5)
|
||||
std::string names[36] = {
|
||||
"hgemm_nn_vec_128x128",
|
||||
"hgemm_nn_128x128",
|
||||
"hgemm_nt_vec_128x128",
|
||||
"hgemm_nt_128x128",
|
||||
"hgemm_tn_vec_128x128",
|
||||
"hgemm_tn_128x128",
|
||||
"hgemm_nn_vec_128x64",
|
||||
"hgemm_nn_128x64",
|
||||
"hgemm_tn_vec_128x64",
|
||||
"hgemm_tn_128x64",
|
||||
"hgemm_nn_vec_128x32",
|
||||
"hgemm_nn_128x32",
|
||||
"hgemm_tn_vec_128x32",
|
||||
"hgemm_tn_128x32",
|
||||
"hgemm_nn_32x128",
|
||||
"hgemm_nn_vec_32x128",
|
||||
"hgemm_nt_32x128",
|
||||
"hgemm_nt_vec_32x128",
|
||||
"sgemm_nn_vec_128x128",
|
||||
"sgemm_nn_128x128",
|
||||
"sgemm_nt_vec_128x128",
|
||||
"sgemm_nt_128x128",
|
||||
"sgemm_tn_vec_128x128",
|
||||
"sgemm_tn_128x128",
|
||||
"sgemm_nn_vec_128x64",
|
||||
"sgemm_nn_128x64",
|
||||
"sgemm_tn_vec_128x64",
|
||||
"sgemm_tn_128x64",
|
||||
"sgemm_nn_vec_128x32",
|
||||
"sgemm_nn_128x32",
|
||||
"sgemm_tn_vec_128x32",
|
||||
"sgemm_tn_128x32",
|
||||
"sgemm_nn_32x128",
|
||||
"sgemm_nn_vec_32x128",
|
||||
"sgemm_nt_32x128",
|
||||
"sgemm_nt_vec_32x128"
|
||||
};
|
||||
|
||||
std::string base_path(base_path_cstr);
|
||||
|
||||
for (auto kernel : names) {
|
||||
if (nervana_kernels_.count(kernel) > 0)
|
||||
continue;
|
||||
|
||||
CUmodule module;
|
||||
|
||||
std::string path = base_path + kernel + std::string(".cubin");
|
||||
CUresult res = cuModuleLoad(&module, path.c_str());
|
||||
|
||||
if (res != CUDA_SUCCESS) {
|
||||
// std::cerr << "Failed to load: " << kernel << " " << res << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
nervana_modules_.push_back(module);
|
||||
|
||||
CUfunction function;
|
||||
res = cuModuleGetFunction(&function, module, kernel.c_str());
|
||||
if (res != CUDA_SUCCESS) {
|
||||
// std::cerr << "Failed to extract: " << kernel << " " << res << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
nervana_kernels_.insert(std::make_pair(kernel, function));
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
extern "C" bool nervana_unloadKernels() {
|
||||
std::lock_guard<std::mutex> lock(nervana_load_kernels_mutex_);
|
||||
while(nervana_modules_.size() > 0) {
|
||||
auto module = nervana_modules_.back();
|
||||
CUresult res = cuModuleUnload(module);
|
||||
|
||||
nervana_modules_.pop_back();
|
||||
|
||||
if (res != CUDA_SUCCESS)
|
||||
return false;
|
||||
}
|
||||
|
||||
nervana_kernels_.clear();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Size, in bytes, of the buffer callers must allocate for the random state
 * used by stochastic rounding: 2048 * 32 values of sizeof(int) bytes each.
 */
extern "C" size_t nervana_randStateSizeBytes() {
    const size_t state_entries = 2048 * 32;
    return state_entries * sizeof(int);
}
|
||||
|
||||
/**
 * Choose the kernel tile shape and thread-block size for a gemm call.
 *
 * \param grid     explicit preset: 0=32x128, 1=128x32, 2=128x64, 3=128x128.
 *                 Any other value (negative, or a preset that does not
 *                 exist) selects the tile shape heuristically from m and n.
 * \param m        number of rows of the output
 * \param n        number of columns of the output
 * \param sm_count multiprocessor count of the device; values <= 0 are
 *                 treated as 1 so the heuristic never divides by zero
 * \param trans    two-character transpose code ("nn", "nt", "tn", ...)
 * \return (sizeA, sizeB, threads)
 */
std::tuple<int, int, int> get_grid_dimensions(int grid, int m, int n, int sm_count, const std::string& trans)
{
    // Explicit presets. Previously an out-of-range positive value fell
    // through every branch and returned uninitialized sizes.
    switch (grid) {
        case 0: return std::make_tuple(32, 128, 128);
        case 1: return std::make_tuple(128, 32, 128);
        case 2: return std::make_tuple(128, 64, 128);
        case 3: return std::make_tuple(128, 128, 256);
        default: break; // fall back to the heuristic below
    }

    // Plain conditionals instead of the CUDA-only unqualified min()/max().
    const int sh = (m < n) ? m : n;

    // Size of the tile along the shorter dimension; 128 unless the shorter
    // dimension is small and leaves a mostly empty 128-wide tile.
    int size = 128;
    if (sh < 384 - 16) {
        const int sh128 = sh % 128;
        if (sh128 > 0 && sh128 < 112) {
            size = 32;
            if (sh128 > 48 && sh128 <= 64) {
                const int wide = (m > n) ? m : n;
                const int wide_tiles = wide / 128 + (wide % 128 != 0);
                const int sms = (sm_count > 0) ? sm_count : 1;
                const int sh64 = (sh / 64) * (wide_tiles / sms);
                if (sh64 > 1) {
                    size = 64;
                }
            }
        }
    }

    int sizeA;
    int sizeB;
    if (m >= n) {
        if (trans == "nt") {
            size = 128;
        }
        sizeA = 128;
        sizeB = size;
    } else {
        if (trans == "tn") {
            size = 128;
        } else if (size == 64) {
            //temporary until kernels exist
            size = 32;
        }
        sizeA = size;
        sizeB = 128;
    }

    const int threads = (sizeA == 128 && sizeB == 128) ? 256 : 128;
    return std::make_tuple(sizeA, sizeB, threads);
}
|
||||
|
||||
extern "C" bool nervana_sgemm(float *A, float *B, float *C,
|
||||
bool a_t, bool b_t,
|
||||
int m, int n, int k,
|
||||
int lda, int ldb, int ldc,
|
||||
float alpha, float beta,
|
||||
unsigned int *rand_state,
|
||||
bool stochastic_round, bool apply_relu,
|
||||
CUstream stream, int grid
|
||||
)
|
||||
{
|
||||
int sm_count;
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(nervana_sm_count_mutex_);
|
||||
|
||||
CUdevice device;
|
||||
CUresult res = cuCtxGetDevice(&device);
|
||||
if (res != CUDA_SUCCESS) {
|
||||
return false;
|
||||
}
|
||||
auto count = nervana_sm_counts_.find(device);
|
||||
if (count != nervana_sm_counts_.end()) {
|
||||
sm_count = count->second;
|
||||
}
|
||||
else {
|
||||
int pi;
|
||||
res = cuDeviceGetAttribute(&pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device);
|
||||
if (res != CUDA_SUCCESS) {
|
||||
return false;
|
||||
}
|
||||
sm_count = pi;
|
||||
nervana_sm_counts_[device] = pi;
|
||||
}
|
||||
}
|
||||
|
||||
std::string name = "sgemm_";
|
||||
|
||||
std::string trans;
|
||||
trans += a_t ? 't' : 'n';
|
||||
trans += b_t ? 't' : 'n';
|
||||
|
||||
name += trans;
|
||||
|
||||
int sizeA, sizeB, threads;
|
||||
|
||||
std::tie(sizeA, sizeB, threads) = get_grid_dimensions(grid, m, n, sm_count, trans);
|
||||
|
||||
int k_vec = (sizeA == 32 || sizeB == 32) ? 4 : 16;
|
||||
|
||||
if ( (trans == "tn" && m % 4 == 0 && n % 4 == 0) ||
|
||||
(trans == "nn" && k % k_vec == 0 && n % 4 == 0) ||
|
||||
(trans == "nt" && k % k_vec == 0)) {
|
||||
name += "_vec";
|
||||
}
|
||||
|
||||
int gridA = m / sizeA + (m % sizeA != 0);
|
||||
int gridB = n / sizeB + (n % sizeB != 0);
|
||||
std::stringstream ss;
|
||||
ss << "_" << sizeA << "x" << sizeB;
|
||||
name += ss.str();
|
||||
|
||||
int flags = 0;
|
||||
flags |= (stochastic_round << 0);
|
||||
flags |= (apply_relu << 1);
|
||||
|
||||
CUresult res;
|
||||
|
||||
if (a_t)
|
||||
lda *= (8 * sizeof(float));
|
||||
|
||||
if (!b_t)
|
||||
ldb *= (8 * sizeof(float));
|
||||
|
||||
int zero = 0;
|
||||
void *args[17] = {&rand_state, &A, &B, &C, &lda, &ldb, &ldc, &m, &n, &k, &alpha, &beta, &flags,
|
||||
&zero, &zero, &zero, &zero};
|
||||
|
||||
res = cuLaunchKernel(nervana_kernels_[name],
|
||||
1, gridA, gridB,
|
||||
threads, 1, 1,
|
||||
0,
|
||||
stream, args, NULL);
|
||||
|
||||
if (res != CUDA_SUCCESS) {
|
||||
std::cerr << "Error launching kernel " << name << " " << res << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
extern "C" bool nervana_hgemm(short *A, short *B, short *C,
|
||||
bool a_t, bool b_t,
|
||||
int m, int n, int k,
|
||||
int lda, int ldb, int ldc,
|
||||
float alpha, float beta,
|
||||
unsigned int *rand_state,
|
||||
bool stochastic_round, bool apply_relu,
|
||||
CUstream stream, int grid
|
||||
)
|
||||
{
|
||||
int sm_count;
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(nervana_sm_count_mutex_);
|
||||
|
||||
CUdevice device;
|
||||
CUresult res = cuCtxGetDevice(&device);
|
||||
if (res != CUDA_SUCCESS) {
|
||||
return false;
|
||||
}
|
||||
auto count = nervana_sm_counts_.find(device);
|
||||
if (count != nervana_sm_counts_.end()) {
|
||||
sm_count = count->second;
|
||||
}
|
||||
else {
|
||||
int pi;
|
||||
res = cuDeviceGetAttribute(&pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device);
|
||||
if (res != CUDA_SUCCESS) {
|
||||
return false;
|
||||
}
|
||||
sm_count = pi;
|
||||
nervana_sm_counts_[device] = pi;
|
||||
}
|
||||
}
|
||||
|
||||
std::string name = "hgemm_";
|
||||
|
||||
std::string trans;
|
||||
trans += a_t ? 't' : 'n';
|
||||
trans += b_t ? 't' : 'n';
|
||||
|
||||
name += trans;
|
||||
|
||||
int sizeA, sizeB, threads;
|
||||
|
||||
std::tie(sizeA, sizeB, threads) = get_grid_dimensions(grid, m, n, sm_count, trans);
|
||||
|
||||
int k_vec = (sizeA == 32 || sizeB == 32) ? 4 : 16;
|
||||
|
||||
if ( (trans == "tn" && m % 4 == 0 && n % 4 == 0) ||
|
||||
(trans == "nn" && k % k_vec == 0 && n % 4 == 0) ||
|
||||
(trans == "nt" && k % k_vec == 0)) {
|
||||
name += "_vec";
|
||||
}
|
||||
|
||||
int gridA = m / sizeA + (m % sizeA != 0);
|
||||
int gridB = n / sizeB + (n % sizeB != 0);
|
||||
std::stringstream ss;
|
||||
ss << "_" << sizeA << "x" << sizeB;
|
||||
name += ss.str();
|
||||
|
||||
int flags = 0;
|
||||
flags |= (stochastic_round << 0);
|
||||
flags |= (apply_relu << 1);
|
||||
|
||||
CUresult res;
|
||||
|
||||
if (a_t)
|
||||
lda *= (8 * sizeof(short));
|
||||
|
||||
if (!b_t)
|
||||
ldb *= (8 * sizeof(short));
|
||||
|
||||
int zero = 0;
|
||||
void *args[17] = {&rand_state, &A, &B, &C, &lda, &ldb, &ldc, &m, &n, &k, &alpha, &beta, &flags,
|
||||
&zero, &zero, &zero, &zero};
|
||||
|
||||
res = cuLaunchKernel(nervana_kernels_[name],
|
||||
1, gridA, gridB,
|
||||
threads, 1, 1,
|
||||
0,
|
||||
stream, args, NULL);
|
||||
|
||||
if (res != CUDA_SUCCESS) {
|
||||
std::cerr << "Error launching kernel " << name << " " << res << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
extern "C" bool nervana_sgemm_colmajor(float *A, float *B, float *C,
|
||||
bool a_t, bool b_t,
|
||||
int m, int n, int k,
|
||||
int lda, int ldb, int ldc,
|
||||
float alpha, float beta,
|
||||
unsigned int *rand_state,
|
||||
bool stochastic_round, bool apply_relu,
|
||||
CUstream stream, int grid
|
||||
)
|
||||
{
|
||||
return nervana_sgemm(B, A, C,
|
||||
b_t, a_t,
|
||||
n, m, k,
|
||||
ldb, lda, ldc,
|
||||
alpha, beta,
|
||||
rand_state, stochastic_round, apply_relu,
|
||||
stream, grid);
|
||||
}
|
||||
|
||||
extern "C" bool nervana_hgemm_colmajor(short *A, short *B, short *C,
|
||||
bool a_t, bool b_t,
|
||||
int m, int n, int k,
|
||||
int lda, int ldb, int ldc,
|
||||
float alpha, float beta,
|
||||
unsigned int *rand_state,
|
||||
bool stochastic_round, bool apply_relu,
|
||||
CUstream stream, int grid
|
||||
)
|
||||
{
|
||||
return nervana_hgemm(B, A, C,
|
||||
b_t, a_t,
|
||||
n, m, k,
|
||||
ldb, lda, ldc,
|
||||
alpha, beta,
|
||||
rand_state, stochastic_round, apply_relu,
|
||||
stream, grid);
|
||||
}
|
@ -1,132 +0,0 @@
|
||||
/*
|
||||
* Copyright 2015 Baidu USA, Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cuda.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#else
|
||||
#include <stdbool.h>
|
||||
#endif
|
||||
|
||||
/** Load all the sgemm and hgemm cubins from the given path
|
||||
* \param [in] base_path path to the kernel cubins
|
||||
* \return true on success and false if an error was encountered
|
||||
*/
|
||||
bool nervana_loadKernels(const char* const base_path);
|
||||
|
||||
/** Unload all currently loaded cubins
|
||||
* \return true on success and false if an error was encountered
|
||||
*/
|
||||
bool nervana_unloadKernels();
|
||||
|
||||
/** Return the number of bytes required for the random state
|
||||
* used in stochastic rounding.
|
||||
* \return bytes required for random state
|
||||
*/
|
||||
size_t nervana_randStateSizeBytes();
|
||||
|
||||
/** Perform BLAS sgemm on alpha * A * B + beta * C, with the
|
||||
* additional options of stochastic rounding and applying a
|
||||
* rectified linear unit (relu) to the result. This routine expects
|
||||
* all matrices to be in row-major order.
|
||||
* \param [in] A Pointer to the data for matrix A
|
||||
* \param [in] B Pointer to the data for matrix B
|
||||
* \param [in, out] C Pointer to the data for matrix C
|
||||
* \param [in] m number of rows of C
|
||||
* \param [in] n number of columns of C
|
||||
* \param [in] k inner dimension of multiplication
|
||||
* \param [in] lda leading dimension of two-dimensional array A
|
||||
* \param [in] ldb leading dimension of two-dimensional array B
|
||||
* \param [in] ldc leading dimension of two-dimensional array C
|
||||
* \param [in] alpha scalar used for multiplication
|
||||
* \param [in] beta scalar used for multiplication
|
||||
* \param [in, out] rand_state pointer to memory used for random state
|
||||
* use nervana_randStateSizeBytes to allocate the correct size
|
||||
* if stochastic_round is false, this can be NULL
|
||||
* \param [in] stochastic_round true if stochastic rounding should be used
|
||||
* \param [in] apply_relu true if a relu should be applied to the result
|
||||
* \param [in] stream The cudaStream on which the kernel should be launched
|
||||
* \param [in] grid Choose a specific grid configuration: 0=32x128, 1=128x32, 2=128x64, 3=128x128
|
||||
*/
|
||||
bool nervana_sgemm(float *A, float *B, float *C,
|
||||
bool a_t, bool b_t,
|
||||
int m, int n, int k,
|
||||
int lda, int ldb, int ldc,
|
||||
float alpha, float beta,
|
||||
unsigned int *rand_state,
|
||||
bool stochastic_round, bool apply_relu,
|
||||
CUstream stream, int grid=-1
|
||||
);
|
||||
|
||||
/** Perform BLAS hgemm on alpha * A * B + beta * C, with the
|
||||
* additional options of stochastic rounding and applying a
|
||||
* rectified linear unit (relu) to the result. This routine expects
|
||||
* all matrices to be in row-major order.
|
||||
* \param [in] A Pointer to the data for matrix A
|
||||
* \param [in] B Pointer to the data for matrix B
|
||||
* \param [in, out] C Pointer to the data for matrix C
|
||||
* \param [in] m number of rows of C
|
||||
* \param [in] n number of columns of C
|
||||
* \param [in] k inner dimension of multiplication
|
||||
* \param [in] lda leading dimension of two-dimensional array A
|
||||
* \param [in] ldb leading dimension of two-dimensional array B
|
||||
* \param [in] ldc leading dimension of two-dimensional array C
|
||||
* \param [in] alpha scalar used for multiplication
|
||||
* \param [in] beta scalar used for multiplication
|
||||
* \param [in, out] rand_state pointer to memory used for random state
|
||||
* use nervana_randStateSizeBytes to allocate the correct size
|
||||
* if stochastic_round is false, this can be NULL
|
||||
* \param [in] stochastic_round true if stochastic rounding should be used
|
||||
* \param [in] apply_relu true if a relu should be applied to the result
|
||||
* \param [in] stream The cudaStream on which the kernel should be launched
|
||||
* \param [in] grid Choose a specific grid configuration: 0=32x128, 1=128x32, 2=128x64, 3=128x128
|
||||
*/
|
||||
bool nervana_hgemm(short *A, short *B, short *C,
|
||||
bool a_t, bool b_t,
|
||||
int m, int n, int k,
|
||||
int lda, int ldb, int ldc,
|
||||
float alpha, float beta,
|
||||
unsigned int *rand_state,
|
||||
bool stochastic_round, bool apply_relu,
|
||||
CUstream stream, int grid=-1
|
||||
);
|
||||
|
||||
bool nervana_sgemm_colmajor(float *A, float *B, float *C,
|
||||
bool a_t, bool b_t,
|
||||
int m, int n, int k,
|
||||
int lda, int ldb, int ldc,
|
||||
float alpha, float beta,
|
||||
unsigned int *rand_state,
|
||||
bool stochastic_round, bool apply_relu,
|
||||
CUstream stream, int grid=-1
|
||||
);
|
||||
|
||||
bool nervana_hgemm_colmajor(short *A, short *B, short *C,
|
||||
bool a_t, bool b_t,
|
||||
int m, int n, int k,
|
||||
int lda, int ldb, int ldc,
|
||||
float alpha, float beta,
|
||||
unsigned int *rand_state,
|
||||
bool stochastic_round, bool apply_relu,
|
||||
CUstream stream, int grid=-1
|
||||
);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
@ -1,15 +0,0 @@
|
||||
#include "nervana.h"
|
||||
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/operators/fully_connected_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
// Register the FC operator for CUDA under the engine name NERVANA, using the
// FullyConnectedOp instantiation tagged with NervanaEngine.
// NOTE(review): presumably selected when an OperatorDef sets engine
// "NERVANA" - confirm against the operator registry.
REGISTER_CUDA_OPERATOR_WITH_ENGINE(
|
||||
FC,
|
||||
NERVANA,
|
||||
FullyConnectedOp<CUDAContext, NervanaEngine>);
|
||||
// Same registration for the gradient operator.
REGISTER_CUDA_OPERATOR_WITH_ENGINE(
|
||||
FCGradient,
|
||||
NERVANA,
|
||||
FullyConnectedGradientOp<CUDAContext, NervanaEngine>);
|
||||
} // namespace caffe2
|
@ -1,66 +0,0 @@
|
||||
#include "nervana.h"
|
||||
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/core/flags.h"
|
||||
#include "caffe2/core/blob.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/workspace.h"
|
||||
#include "caffe2/operators/fully_connected_op.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
#include "common/gtest/gtest_extensions.h"
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
C10_DECLARE_string(caffe_test_root);
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace {
|
||||
static void AddConstInput(const std::vector<int>& shape, const float value,
|
||||
const string& name, Workspace* ws) {
|
||||
DeviceOption option;
|
||||
option.set_device_type(PROTO_CUDA);
|
||||
CUDAContext context(option);
|
||||
Blob* blob = ws->CreateBlob(name);
|
||||
auto* tensor = BlobGetMutableTensor(blob, CUDA);
|
||||
tensor->Resize(shape);
|
||||
math::Set<float, CUDAContext>(tensor->size(), value,
|
||||
tensor->mutable_data<float>(),
|
||||
&context);
|
||||
return;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// Runs the FC operator with the NERVANA engine on constant inputs and checks
// the output. X is 5x10 of ones, W is 6x10 of ones and B is 0.1, so every one
// of the 5*6 outputs is expected to be 10 * 1 + 0.1 = 10.1.
TEST(NervanaFullyConnectedTest, Test) {
  if (!NervanaKernelLoaded()) {
    SKIP() << "Nervana kernels are not loaded. Skipping test.";
  }
  Workspace ws;
  OperatorDef def;
  def.set_name("test");
  def.set_type("FC");
  def.add_input("X");
  def.add_input("W");
  def.add_input("B");
  def.add_output("Y");
  def.mutable_device_option()->set_device_type(PROTO_CUDA);
  def.set_engine("NERVANA");
  AddConstInput(std::vector<int>{5, 10}, 1., "X", &ws);
  AddConstInput(std::vector<int>{6, 10}, 1., "W", &ws);
  AddConstInput(std::vector<int>{6}, 0.1, "B", &ws);
  unique_ptr<OperatorBase> op(
      new FullyConnectedOp<CUDAContext, NervanaEngine>(def, &ws));
  // Fatal assertions: the statements below dereference these pointers.
  ASSERT_NE(nullptr, op.get());
  EXPECT_TRUE(op->Run());
  Blob* Yblob = ws.GetBlob("Y");
  ASSERT_NE(nullptr, Yblob);
  auto& Y = Yblob->Get<Tensor>();
  Tensor Y_cpu(Y, CPU);
  EXPECT_EQ(Y.size(), 5 * 6);
  for (int i = 0; i < Y.size(); ++i) {
    // gtest expectations instead of glog CHECKs, so a wrong value is reported
    // as a test failure rather than aborting the whole test binary.
    EXPECT_LT(Y_cpu.data<float>()[i], 10.11);
    EXPECT_GT(Y_cpu.data<float>()[i], 10.09);
  }
}
|
||||
|
||||
} // namespace caffe2
|
@ -1,48 +0,0 @@
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/flags.h"
|
||||
|
||||
#include "nervana_c_api.h"
|
||||
|
||||
C10_DEFINE_string(
|
||||
nervana_cubin_path,
|
||||
"/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/cubin/",
|
||||
"The cubin path for nervana kernels. Currently defaulted "
|
||||
"to the internal fb deployment path.");
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace {
|
||||
static bool g_nervana_kernel_loaded = false;
|
||||
} // namespace
|
||||
|
||||
bool NervanaKernelLoaded() { return g_nervana_kernel_loaded; }
|
||||
|
||||
bool Caffe2InitializeNervanaKernels(int*, char***) {
|
||||
// If we do not specify the nervana cubin path, we will simply return.
|
||||
if (FLAGS_nervana_cubin_path.size() == 0) {
|
||||
VLOG(1) << "Nervana cubin loading skipped.";
|
||||
return true;
|
||||
}
|
||||
g_nervana_kernel_loaded =
|
||||
nervana_loadKernels(FLAGS_nervana_cubin_path.c_str());
|
||||
if (g_nervana_kernel_loaded) {
|
||||
VLOG(1) << "Loaded nervana kernels from path "
|
||||
<< FLAGS_nervana_cubin_path;
|
||||
} else {
|
||||
// Since this is not a critical error we will just vlog it.
|
||||
VLOG(1) << "Cannot load nervana gpu kernels from path "
|
||||
<< FLAGS_nervana_cubin_path
|
||||
<< ", will disable Caffe2 nervana engines.";
|
||||
}
|
||||
// We will always return true for this initialization, because the loading
|
||||
// result is kept and accessible via NervanaKernelLoaded(). This allows us
|
||||
// to register an init function but not forcing the user to have to install
|
||||
// nervana kernels, delaying the failure to the first time a nervana kernel
|
||||
// is actually called.
|
||||
return true;
|
||||
}
|
||||
|
||||
REGISTER_CAFFE2_INIT_FUNCTION(Caffe2InitializeNervanaKernels,
|
||||
&Caffe2InitializeNervanaKernels,
|
||||
"Initialize nervana kernels for caffe2.");
|
||||
} // namespace caffe2
|
@ -1,53 +0,0 @@
|
||||
#include "nervana.h"
|
||||
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace math {
|
||||
|
||||
// Caffe2 gemm provides a simpler interface to the gemm functions, with the
|
||||
// limitation that the data has to be contiguous in memory.
|
||||
template <>
|
||||
void Gemm<float, CUDAContext, NervanaEngine>(
|
||||
const CBLAS_TRANSPOSE TransA,
|
||||
const CBLAS_TRANSPOSE TransB,
|
||||
const int M,
|
||||
const int N,
|
||||
const int K,
|
||||
const float alpha,
|
||||
const float* A,
|
||||
const float* B,
|
||||
const float beta,
|
||||
float* C,
|
||||
CUDAContext* context,
|
||||
TensorProto::DataType /*math_type*/) {
|
||||
// Note that cublas follows fortran order, so the order is different from
|
||||
// the cblas convention.
|
||||
int lda = (TransA == CblasNoTrans) ? K : M;
|
||||
int ldb = (TransB == CblasNoTrans) ? N : K;
|
||||
bool a_t = (TransA == CblasTrans);
|
||||
bool b_t = (TransB == CblasTrans);
|
||||
CAFFE_ENFORCE(nervana_sgemm(
|
||||
const_cast<float*>(A),
|
||||
const_cast<float*>(B),
|
||||
C,
|
||||
a_t,
|
||||
b_t,
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
lda,
|
||||
ldb,
|
||||
N,
|
||||
alpha,
|
||||
beta,
|
||||
nullptr,
|
||||
false,
|
||||
false,
|
||||
context->cuda_stream()));
|
||||
}
|
||||
|
||||
} // namespace math
|
||||
} // namespace caffe2
|
@ -105,10 +105,6 @@ function (caffe2_print_configuration_summary)
|
||||
if(${USE_NCCL})
|
||||
message(STATUS " USE_SYSTEM_NCCL : ${USE_SYSTEM_NCCL}")
|
||||
endif()
|
||||
message(STATUS " USE_NERVANA_GPU : ${USE_NERVANA_GPU}")
|
||||
if(${USE_NERVANA_GPU})
|
||||
message(STATUS " NERVANA_GPU version : ${NERVANA_GPU_VERSION}")
|
||||
endif()
|
||||
message(STATUS " USE_NNPACK : ${USE_NNPACK}")
|
||||
message(STATUS " USE_NUMPY : ${USE_NUMPY}")
|
||||
message(STATUS " USE_OBSERVERS : ${USE_OBSERVERS}")
|
||||
|
Reference in New Issue
Block a user