Remove NervanaGPU operators from Caffe2 (#12564)

Summary:
Fix #12540
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12564

Reviewed By: orionr

Differential Revision: D10379775

Pulled By: soumith

fbshipit-source-id: a925b116f2687e56bf54465fc02ca2eb1e7c8eb0
Author: mratsim
Date: 2018-10-15 11:02:31 -07:00
Committed by: Facebook Github Bot
Parent: 151b28521a
Commit: a1bbe80e21

11 changed files with 0 additions and 787 deletions

.gitmodules

@@ -13,9 +13,6 @@
[submodule "third_party/googletest"]
path = third_party/googletest
url = https://github.com/google/googletest.git
[submodule "third_party/nervanagpu"]
path = third_party/nervanagpu
url = https://github.com/NervanaSystems/nervanagpu.git
[submodule "third_party/benchmark"]
path = third_party/benchmark
url = https://github.com/google/benchmark.git


@@ -95,7 +95,6 @@ option(USE_MOBILE_OPENGL "Use OpenGL for mobile code" ON)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
option(USE_NCCL "Use NCCL" ON)
option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF)
option(USE_NERVANA_GPU "Use Nervana GPU backend" OFF)
option(USE_NNAPI "Use NNAPI" OFF)
option(USE_NNPACK "Use NNPACK" ON)
option(USE_NUMA "Use NUMA (only available on Linux)" ON)


@@ -1,12 +0,0 @@
if(USE_NERVANA_GPU)
message(STATUS "Include Nervana operators")
set(Caffe2_CONTRIB_NCCL_GPU_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/nervana_c_api.cu"
"${CMAKE_CURRENT_SOURCE_DIR}/nervana_fc_op_gpu.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/nervana_init_gpu.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/nervana_math_gpu.cc"
)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${Caffe2_CONTRIB_NCCL_GPU_SRC})
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
endif()


@@ -1,34 +0,0 @@
#ifndef CAFFE2_FB_NERVANA_INIT_H_
#define CAFFE2_FB_NERVANA_INIT_H_
#include "caffe2/core/init.h"
#include "caffe2/core/flags.h"
#include "nervana_c_api.h"
/**
* A flag that specifies the nervana cubin path.
*/
C10_DECLARE_string(nervana_cubin_path);
namespace caffe2 {
/**
* An empty class to be used in identifying the engine in the math functions.
*/
class NervanaEngine {};
/**
* Returns whether the nervana kernels are loaded or not.
*/
bool NervanaKernelLoaded();
/**
* An initialization function that is run once by caffe2::GlobalInit()
* that initializes the nervana kernels.
*/
bool Caffe2InitializeNervanaKernels();
} // namespace caffe2
#endif // CAFFE2_FB_NERVANA_INIT_H_


@@ -1,419 +0,0 @@
/*
* Copyright 2015 Baidu USA, Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <vector>
#include <string>
#include <map>
#include <cuda.h>
#include <iostream>
#include <sstream>
#include <mutex>
#include <tuple>
#include "nervana_c_api.h"
std::map<CUdevice, int> nervana_sm_counts_;
std::map<std::string, CUfunction> nervana_kernels_;
std::vector<CUmodule> nervana_modules_;
//for when we need to modify the above data structures
std::mutex nervana_load_kernels_mutex_;
std::mutex nervana_sm_count_mutex_;
extern "C" bool nervana_loadKernels(const char* const base_path_cstr) {
std::lock_guard<std::mutex> lock(nervana_load_kernels_mutex_);
//better would be a vector<string>, but there is a bug in nvcc that prevents this
// (bug report filed) (fixed in 7.5)
std::string names[36] = {
"hgemm_nn_vec_128x128",
"hgemm_nn_128x128",
"hgemm_nt_vec_128x128",
"hgemm_nt_128x128",
"hgemm_tn_vec_128x128",
"hgemm_tn_128x128",
"hgemm_nn_vec_128x64",
"hgemm_nn_128x64",
"hgemm_tn_vec_128x64",
"hgemm_tn_128x64",
"hgemm_nn_vec_128x32",
"hgemm_nn_128x32",
"hgemm_tn_vec_128x32",
"hgemm_tn_128x32",
"hgemm_nn_32x128",
"hgemm_nn_vec_32x128",
"hgemm_nt_32x128",
"hgemm_nt_vec_32x128",
"sgemm_nn_vec_128x128",
"sgemm_nn_128x128",
"sgemm_nt_vec_128x128",
"sgemm_nt_128x128",
"sgemm_tn_vec_128x128",
"sgemm_tn_128x128",
"sgemm_nn_vec_128x64",
"sgemm_nn_128x64",
"sgemm_tn_vec_128x64",
"sgemm_tn_128x64",
"sgemm_nn_vec_128x32",
"sgemm_nn_128x32",
"sgemm_tn_vec_128x32",
"sgemm_tn_128x32",
"sgemm_nn_32x128",
"sgemm_nn_vec_32x128",
"sgemm_nt_32x128",
"sgemm_nt_vec_32x128"
};
std::string base_path(base_path_cstr);
for (auto kernel : names) {
if (nervana_kernels_.count(kernel) > 0)
continue;
CUmodule module;
std::string path = base_path + kernel + std::string(".cubin");
CUresult res = cuModuleLoad(&module, path.c_str());
if (res != CUDA_SUCCESS) {
// std::cerr << "Failed to load: " << kernel << " " << res << std::endl;
return false;
}
nervana_modules_.push_back(module);
CUfunction function;
res = cuModuleGetFunction(&function, module, kernel.c_str());
if (res != CUDA_SUCCESS) {
// std::cerr << "Failed to extract: " << kernel << " " << res << std::endl;
return false;
}
nervana_kernels_.insert(std::make_pair(kernel, function));
}
return true;
}
extern "C" bool nervana_unloadKernels() {
std::lock_guard<std::mutex> lock(nervana_load_kernels_mutex_);
while(nervana_modules_.size() > 0) {
auto module = nervana_modules_.back();
CUresult res = cuModuleUnload(module);
nervana_modules_.pop_back();
if (res != CUDA_SUCCESS)
return false;
}
nervana_kernels_.clear();
return true;
}
extern "C" size_t nervana_randStateSizeBytes() {
return 2048 * 32 * sizeof(int);
}
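// Picks the tile (sizeA x sizeB) and thread count for a GEMM launch. With
// grid < 0 the choice is heuristic; for example m = n = 4096 keeps the full
// 128x128 tile with 256 threads, while a short side such as m = 60 with
// trans "nn" falls back to a 32x128 tile with 128 threads.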
std::tuple<int, int, int> get_grid_dimensions(int grid, int m, int n, int sm_count, const std::string& trans)
{
int sizeA, sizeB, threads;
if (grid >= 0) {
if (grid == 0) {
sizeA = 32;
sizeB = 128;
threads = 128;
} else if (grid == 1) {
sizeA = 128;
sizeB = 32;
threads = 128;
} else if (grid == 2) {
sizeA = 128;
sizeB = 64;
threads = 128;
} else if (grid == 3) {
sizeA = 128;
sizeB = 128;
threads = 256;
}
} else {
int sh = min(m, n);
int size;
if (sh < 384 - 16) {
int sh128 = sh % 128;
if (sh128 > 0 && sh128 < 112) {
if (sh128 > 48 && sh128 <= 64) {
int sh64 = sh / 64;
int wide = max(m, n);
sh64 *= (wide / 128 + (wide % 128 != 0)) / sm_count;
if (sh64 > 1) {
size = 64;
}
else {
size = 32;
}
}
else {
size = 32;
}
}
else {
size = 128;
}
} else {
size = 128;
}
if (m >= n) {
if (trans == "nt") {
size = 128;
}
sizeA = 128;
sizeB = size;
} else {
if (trans == "tn") {
size = 128;
} else if (size == 64) {
//temporary until kernels exist
size = 32;
}
sizeA = size;
sizeB = 128;
}
threads = (sizeA == 128 && sizeB == 128) ? 256 : 128;
}
return std::make_tuple(sizeA, sizeB, threads);
}
extern "C" bool nervana_sgemm(float *A, float *B, float *C,
bool a_t, bool b_t,
int m, int n, int k,
int lda, int ldb, int ldc,
float alpha, float beta,
unsigned int *rand_state,
bool stochastic_round, bool apply_relu,
CUstream stream, int grid
)
{
int sm_count;
{
std::lock_guard<std::mutex> lock(nervana_sm_count_mutex_);
CUdevice device;
CUresult res = cuCtxGetDevice(&device);
if (res != CUDA_SUCCESS) {
return false;
}
auto count = nervana_sm_counts_.find(device);
if (count != nervana_sm_counts_.end()) {
sm_count = count->second;
}
else {
int pi;
res = cuDeviceGetAttribute(&pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device);
if (res != CUDA_SUCCESS) {
return false;
}
sm_count = pi;
nervana_sm_counts_[device] = pi;
}
}
std::string name = "sgemm_";
std::string trans;
trans += a_t ? 't' : 'n';
trans += b_t ? 't' : 'n';
name += trans;
int sizeA, sizeB, threads;
std::tie(sizeA, sizeB, threads) = get_grid_dimensions(grid, m, n, sm_count, trans);
int k_vec = (sizeA == 32 || sizeB == 32) ? 4 : 16;
if ( (trans == "tn" && m % 4 == 0 && n % 4 == 0) ||
(trans == "nn" && k % k_vec == 0 && n % 4 == 0) ||
(trans == "nt" && k % k_vec == 0)) {
name += "_vec";
}
int gridA = m / sizeA + (m % sizeA != 0);
int gridB = n / sizeB + (n % sizeB != 0);
std::stringstream ss;
ss << "_" << sizeA << "x" << sizeB;
name += ss.str();
int flags = 0;
flags |= (stochastic_round << 0);
flags |= (apply_relu << 1);
CUresult res;
if (a_t)
lda *= (8 * sizeof(float));
if (!b_t)
ldb *= (8 * sizeof(float));
int zero = 0;
void *args[17] = {&rand_state, &A, &B, &C, &lda, &ldb, &ldc, &m, &n, &k, &alpha, &beta, &flags,
&zero, &zero, &zero, &zero};
res = cuLaunchKernel(nervana_kernels_[name],
1, gridA, gridB,
threads, 1, 1,
0,
stream, args, NULL);
if (res != CUDA_SUCCESS) {
std::cerr << "Error launching kernel " << name << " " << res << std::endl;
return false;
}
return true;
}
extern "C" bool nervana_hgemm(short *A, short *B, short *C,
bool a_t, bool b_t,
int m, int n, int k,
int lda, int ldb, int ldc,
float alpha, float beta,
unsigned int *rand_state,
bool stochastic_round, bool apply_relu,
CUstream stream, int grid
)
{
int sm_count;
{
std::lock_guard<std::mutex> lock(nervana_sm_count_mutex_);
CUdevice device;
CUresult res = cuCtxGetDevice(&device);
if (res != CUDA_SUCCESS) {
return false;
}
auto count = nervana_sm_counts_.find(device);
if (count != nervana_sm_counts_.end()) {
sm_count = count->second;
}
else {
int pi;
res = cuDeviceGetAttribute(&pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device);
if (res != CUDA_SUCCESS) {
return false;
}
sm_count = pi;
nervana_sm_counts_[device] = pi;
}
}
std::string name = "hgemm_";
std::string trans;
trans += a_t ? 't' : 'n';
trans += b_t ? 't' : 'n';
name += trans;
int sizeA, sizeB, threads;
std::tie(sizeA, sizeB, threads) = get_grid_dimensions(grid, m, n, sm_count, trans);
int k_vec = (sizeA == 32 || sizeB == 32) ? 4 : 16;
if ( (trans == "tn" && m % 4 == 0 && n % 4 == 0) ||
(trans == "nn" && k % k_vec == 0 && n % 4 == 0) ||
(trans == "nt" && k % k_vec == 0)) {
name += "_vec";
}
int gridA = m / sizeA + (m % sizeA != 0);
int gridB = n / sizeB + (n % sizeB != 0);
std::stringstream ss;
ss << "_" << sizeA << "x" << sizeB;
name += ss.str();
int flags = 0;
flags |= (stochastic_round << 0);
flags |= (apply_relu << 1);
CUresult res;
if (a_t)
lda *= (8 * sizeof(short));
if (!b_t)
ldb *= (8 * sizeof(short));
int zero = 0;
void *args[17] = {&rand_state, &A, &B, &C, &lda, &ldb, &ldc, &m, &n, &k, &alpha, &beta, &flags,
&zero, &zero, &zero, &zero};
res = cuLaunchKernel(nervana_kernels_[name],
1, gridA, gridB,
threads, 1, 1,
0,
stream, args, NULL);
if (res != CUDA_SUCCESS) {
std::cerr << "Error launching kernel " << name << " " << res << std::endl;
return false;
}
return true;
}
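// The *_colmajor wrappers below reuse the row-major kernels: a column-major
// C is just C^T stored row-major, and C^T = op(B)^T * op(A)^T, so swapping
// the operand pointers, transpose flags, leading dimensions and the m/n
// extents is all that is needed.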
extern "C" bool nervana_sgemm_colmajor(float *A, float *B, float *C,
bool a_t, bool b_t,
int m, int n, int k,
int lda, int ldb, int ldc,
float alpha, float beta,
unsigned int *rand_state,
bool stochastic_round, bool apply_relu,
CUstream stream, int grid
)
{
return nervana_sgemm(B, A, C,
b_t, a_t,
n, m, k,
ldb, lda, ldc,
alpha, beta,
rand_state, stochastic_round, apply_relu,
stream, grid);
}
extern "C" bool nervana_hgemm_colmajor(short *A, short *B, short *C,
bool a_t, bool b_t,
int m, int n, int k,
int lda, int ldb, int ldc,
float alpha, float beta,
unsigned int *rand_state,
bool stochastic_round, bool apply_relu,
CUstream stream, int grid
)
{
return nervana_hgemm(B, A, C,
b_t, a_t,
n, m, k,
ldb, lda, ldc,
alpha, beta,
rand_state, stochastic_round, apply_relu,
stream, grid);
}


@@ -1,132 +0,0 @@
/*
* Copyright 2015 Baidu USA, Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cuda.h>
#ifdef __cplusplus
extern "C" {
#else
#include <stdbool.h>
#endif
/** Load all the sgemm and hgemm cubins from the given path
* \param [in] base_path path to the kernel cubins
* \return true on success and false if an error was encountered
*/
bool nervana_loadKernels(const char* const base_path);
/** Unload all currently loaded cubins
* \return true on success and false if an error was encountered
*/
bool nervana_unloadKernels();
/** Return the number of bytes required for the random state
* used in stochastic rounding.
* \return bytes required for random state
*/
size_t nervana_randStateSizeBytes();
/** Perform BLAS sgemm on alpha * A * B + beta * C, with the
* additional options of stochastic rounding and applying a
* rectified linear unit (relu) to the result. This routine expects
* all matrices to be in row-major order.
* \param [in] A Pointer to the data for matrix A
* \param [in] B Pointer to the data for matrix B
* \param [in, out] C Pointer to the data for matrix C
* \param [in] m number of rows of C
* \param [in] n number of columns of C
* \param [in] k inner dimension of multiplication
* \param [in] lda leading dimension of two-dimensional array A
* \param [in] ldb leading dimension of two-dimensional array B
* \param [in] ldc leading dimension of two-dimensional array C
* \param [in] alpha scalar used for multiplication
* \param [in] beta scalar used for multiplication
* \param [in, out] rand_state pointer to memory used for random state
* use nervana_randStateSizeBytes to allocate the correct size
* if stochastic_round is false, this can be NULL
* \param [in] stochastic_round true if stochastic rounding should be used
* \param [in] apply_relu true if a relu should be applied to the result
* \param [in] stream The cudaStream on which the kernel should be launched
* \param [in] grid Choose a specific grid configuration: 0=32x128, 1=128x32, 2=128x64, 3=128x128
*/
bool nervana_sgemm(float *A, float *B, float *C,
bool a_t, bool b_t,
int m, int n, int k,
int lda, int ldb, int ldc,
float alpha, float beta,
unsigned int *rand_state,
bool stochastic_round, bool apply_relu,
CUstream stream, int grid=-1
);
/** Perform BLAS hgemm on alpha * A * B + beta * C, with the
* additional options of stochastic rounding and applying a
* rectified linear unit (relu) to the result. This routine expects
* all matrices to be in row-major order.
* \param [in] A Pointer to the data for matrix A
* \param [in] B Pointer to the data for matrix B
* \param [in, out] C Pointer to the data for matrix C
* \param [in] m number of rows of C
* \param [in] n number of columns of C
* \param [in] k inner dimension of multiplication
* \param [in] lda leading dimension of two-dimensional array A
* \param [in] ldb leading dimension of two-dimensional array B
* \param [in] ldc leading dimension of two-dimensional array C
* \param [in] alpha scalar used for multiplication
* \param [in] beta scalar used for multiplication
* \param [in, out] rand_state pointer to memory used for random state
* use nervana_randStateSizeBytes to allocate the correct size
* if stochastic_round is false, this can be NULL
* \param [in] stochastic_round true if stochastic rounding should be used
* \param [in] apply_relu true if a relu should be applied to the result
* \param [in] stream The cudaStream on which the kernel should be launched
* \param [in] grid Choose a specific grid configuration: 0=32x128, 1=128x32, 2=128x64, 3=128x128
*/
bool nervana_hgemm(short *A, short *B, short *C,
bool a_t, bool b_t,
int m, int n, int k,
int lda, int ldb, int ldc,
float alpha, float beta,
unsigned int *rand_state,
bool stochastic_round, bool apply_relu,
CUstream stream, int grid=-1
);
bool nervana_sgemm_colmajor(float *A, float *B, float *C,
bool a_t, bool b_t,
int m, int n, int k,
int lda, int ldb, int ldc,
float alpha, float beta,
unsigned int *rand_state,
bool stochastic_round, bool apply_relu,
CUstream stream, int grid=-1
);
bool nervana_hgemm_colmajor(short *A, short *B, short *C,
bool a_t, bool b_t,
int m, int n, int k,
int lda, int ldb, int ldc,
float alpha, float beta,
unsigned int *rand_state,
bool stochastic_round, bool apply_relu,
CUstream stream, int grid=-1
);
#ifdef __cplusplus
}
#endif
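Since this C header is deleted by the commit, a minimal usage sketch of the row-major API is kept here for reference. It is a sketch only: the cubin directory and matrix shapes are placeholders, stochastic rounding is off (so rand_state stays NULL), and a CUDA context is assumed to be current on the calling thread.

#include <cuda.h>
#include <cuda_runtime.h>
#include "nervana_c_api.h"

// Sketch only: row-major C = A * B via the removed API.
bool sgemm_example() {
  // Trailing slash matters: the loader appends "<kernel>.cubin" to this path.
  if (!nervana_loadKernels("/path/to/cubins/")) return false;

  const int m = 128, n = 256, k = 64;
  float *A = nullptr, *B = nullptr, *C = nullptr;
  cudaMalloc((void**)&A, sizeof(float) * m * k);
  cudaMalloc((void**)&B, sizeof(float) * k * n);
  cudaMalloc((void**)&C, sizeof(float) * m * n);
  // ... fill A and B on the device ...

  bool ok = nervana_sgemm(A, B, C,
                          /*a_t=*/false, /*b_t=*/false,
                          m, n, k,
                          /*lda=*/k, /*ldb=*/n, /*ldc=*/n,
                          /*alpha=*/1.0f, /*beta=*/0.0f,
                          /*rand_state=*/nullptr,
                          /*stochastic_round=*/false, /*apply_relu=*/false,
                          /*stream=*/nullptr);  // default stream, default grid

  cudaFree(A); cudaFree(B); cudaFree(C);
  nervana_unloadKernels();
  return ok;
}

With these shapes and trans "nn", the heuristic in nervana_c_api.cu resolves to the sgemm_nn_vec_128x128 kernel.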


@@ -1,15 +0,0 @@
#include "nervana.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/fully_connected_op.h"
namespace caffe2 {
REGISTER_CUDA_OPERATOR_WITH_ENGINE(
FC,
NERVANA,
FullyConnectedOp<CUDAContext, NervanaEngine>);
REGISTER_CUDA_OPERATOR_WITH_ENGINE(
FCGradient,
NERVANA,
FullyConnectedGradientOp<CUDAContext, NervanaEngine>);
} // namespace caffe2


@@ -1,66 +0,0 @@
#include "nervana.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/flags.h"
#include "caffe2/core/blob.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"
#include "caffe2/operators/fully_connected_op.h"
#include "caffe2/utils/math.h"
#include "common/gtest/gtest_extensions.h"
#include <gtest/gtest.h>
C10_DECLARE_string(caffe_test_root);
namespace caffe2 {
namespace {
static void AddConstInput(const std::vector<int>& shape, const float value,
const string& name, Workspace* ws) {
DeviceOption option;
option.set_device_type(PROTO_CUDA);
CUDAContext context(option);
Blob* blob = ws->CreateBlob(name);
auto* tensor = BlobGetMutableTensor(blob, CUDA);
tensor->Resize(shape);
math::Set<float, CUDAContext>(tensor->size(), value,
tensor->mutable_data<float>(),
&context);
return;
}
} // namespace
TEST(NervanaFullyConnectedTest, Test) {
if (!NervanaKernelLoaded()) {
SKIP() << "Nervana kernels are not loaded. Skipping test.";
}
Workspace ws;
OperatorDef def;
def.set_name("test");
def.set_type("FC");
def.add_input("X");
def.add_input("W");
def.add_input("B");
def.add_output("Y");
def.mutable_device_option()->set_device_type(PROTO_CUDA);
def.set_engine("NERVANA");
AddConstInput(std::vector<int>{5, 10}, 1., "X", &ws);
AddConstInput(std::vector<int>{6, 10}, 1., "W", &ws);
AddConstInput(std::vector<int>{6}, 0.1, "B", &ws);
unique_ptr<OperatorBase> op(
new FullyConnectedOp<CUDAContext, NervanaEngine>(def, &ws));
EXPECT_NE(nullptr, op.get());
EXPECT_TRUE(op->Run());
Blob* Yblob = ws.GetBlob("Y");
EXPECT_NE(nullptr, Yblob);
auto& Y = Yblob->Get<Tensor>();
Tensor Y_cpu(Y, CPU);
EXPECT_EQ(Y.size(), 5 * 6);
for (int i = 0; i < Y.size(); ++i) {
CHECK_LT(Y_cpu.data<float>()[i], 10.11);
CHECK_GT(Y_cpu.data<float>()[i], 10.09);
}
}
} // namespace caffe2


@@ -1,48 +0,0 @@
#include "caffe2/core/init.h"
#include "caffe2/core/flags.h"
#include "nervana_c_api.h"
C10_DEFINE_string(
nervana_cubin_path,
"/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/cubin/",
"The cubin path for nervana kernels. Currently defaulted "
"to the internal fb deployment path.");
namespace caffe2 {
namespace {
static bool g_nervana_kernel_loaded = false;
} // namespace
bool NervanaKernelLoaded() { return g_nervana_kernel_loaded; }
bool Caffe2InitializeNervanaKernels(int*, char***) {
// If we do not specify the nervana cubin path, we will simply return.
if (FLAGS_nervana_cubin_path.size() == 0) {
VLOG(1) << "Nervana cubin loading skipped.";
return true;
}
g_nervana_kernel_loaded =
nervana_loadKernels(FLAGS_nervana_cubin_path.c_str());
if (g_nervana_kernel_loaded) {
VLOG(1) << "Loaded nervana kernels from path "
<< FLAGS_nervana_cubin_path;
} else {
// Since this is not a critical error we will just vlog it.
VLOG(1) << "Cannot load nervana gpu kernels from path "
<< FLAGS_nervana_cubin_path
<< ", will disable Caffe2 nervana engines.";
}
// We will always return true for this initialization, because the loading
// result is kept and accessible via NervanaKernelLoaded(). This allows us
// to register an init function but not forcing the user to have to install
// nervana kernels, delaying the failure to the first time a nervana kernel
// is actually called.
return true;
}
REGISTER_CAFFE2_INIT_FUNCTION(Caffe2InitializeNervanaKernels,
&Caffe2InitializeNervanaKernels,
"Initialize nervana kernels for caffe2.");
} // namespace caffe2
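For reference, a hedged sketch of how this init hook was driven: the cubin path came in as a command-line flag consumed by caffe2::GlobalInit, and callers were expected to check NervanaKernelLoaded() before relying on the NERVANA engines. The binary name and path below are placeholders.

#include "caffe2/core/init.h"
#include "caffe2/core/logging.h"
#include "nervana.h"

// Sketch only: launched e.g. as
//   ./some_caffe2_binary --nervana_cubin_path=/path/to/cubins/
int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);  // runs Caffe2InitializeNervanaKernels
  if (!caffe2::NervanaKernelLoaded()) {
    LOG(WARNING) << "Nervana cubins not loaded; NERVANA engines unavailable.";
  }
  // ... build and run nets, optionally with engine "NERVANA" ...
  return 0;
}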


@@ -1,53 +0,0 @@
#include "nervana.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
namespace math {
// Caffe2 gemm provides a simpler interface to the gemm functions, with the
// limitation that the data has to be contiguous in memory.
template <>
void Gemm<float, CUDAContext, NervanaEngine>(
const CBLAS_TRANSPOSE TransA,
const CBLAS_TRANSPOSE TransB,
const int M,
const int N,
const int K,
const float alpha,
const float* A,
const float* B,
const float beta,
float* C,
CUDAContext* context,
TensorProto::DataType /*math_type*/) {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (TransA == CblasNoTrans) ? K : M;
int ldb = (TransB == CblasNoTrans) ? N : K;
bool a_t = (TransA == CblasTrans);
bool b_t = (TransB == CblasTrans);
CAFFE_ENFORCE(nervana_sgemm(
const_cast<float*>(A),
const_cast<float*>(B),
C,
a_t,
b_t,
M,
N,
K,
lda,
ldb,
N,
alpha,
beta,
nullptr,
false,
false,
context->cuda_stream()));
}
} // namespace math
} // namespace caffe2
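A hypothetical call site for the specialization above, assuming math_type keeps its usual FLOAT default in caffe2/utils/math.h; the device pointers and extents are placeholders. The NervanaEngine tag is what routes the call here instead of the default cuBLAS path.

#include "caffe2/core/context_gpu.h"
#include "caffe2/utils/math.h"
#include "nervana.h"

// Sketch only: row-major GEMM dispatched through the Nervana specialization.
void gemm_via_nervana(const float* d_A, const float* d_B, float* d_C,
                      int M, int N, int K, caffe2::CUDAContext* ctx) {
  caffe2::math::Gemm<float, caffe2::CUDAContext, caffe2::NervanaEngine>(
      CblasNoTrans, CblasNoTrans, M, N, K,
      1.0f, d_A, d_B, 0.0f, d_C, ctx);
}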


@@ -105,10 +105,6 @@ function (caffe2_print_configuration_summary)
if(${USE_NCCL})
message(STATUS " USE_SYSTEM_NCCL : ${USE_SYSTEM_NCCL}")
endif()
message(STATUS " USE_NERVANA_GPU : ${USE_NERVANA_GPU}")
if(${USE_NERVANA_GPU})
message(STATUS " NERVANA_GPU version : ${NERVANA_GPU_VERSION}")
endif()
message(STATUS " USE_NNPACK : ${USE_NNPACK}")
message(STATUS " USE_NUMPY : ${USE_NUMPY}")
message(STATUS " USE_OBSERVERS : ${USE_OBSERVERS}")