fbshipit-source-id: ba600fcd2b5cefc7621357bdeb05e24cea02e5af

2025-10-20 21:14:14 +08:00 · 2018-06-27 04:50:56 -07:00
parent 290d20b094
commit 9ec0a2aef4
69 changed files with 989 additions and 300 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -1,5 +0,0 @@
-# Set the default behavior, in case people don't have core.autocrlf set.
-* text=auto
-
-# BASH scripts shouldn't be converted since they may need to be used by Docker
-*.sh text eol=lf
--- a/aten/src/ATen/native/Pooling.cpp
+++ b/aten/src/ATen/native/Pooling.cpp
@ -41,13 +41,10 @@ std::tuple<Tensor,Tensor> adaptive_max_pool1d(const Tensor & self, IntList outpu
  return std::make_tuple(output.squeeze(2), indices.squeeze(2));
 }

-std::tuple<Tensor, Tensor> max_pool1d_with_indices(
-    const Tensor& self,
-    IntList kernel_size,
-    IntList stride,
-    IntList padding,
-    IntList dilation,
-    bool ceil_mode) {
+std::tuple<Tensor,Tensor> max_pool1d(
+    const Tensor & self, IntList kernel_size, IntList stride, IntList padding,
+    IntList dilation, bool ceil_mode) {
+
  if (stride.empty()) {
    stride = kernel_size;
  }
@ -58,7 +55,7 @@ std::tuple<Tensor, Tensor> max_pool1d_with_indices(
  check1d("max_pool1d", "dilation", dilation);

  Tensor output, indices;
-  std::tie(output, indices) = at::max_pool2d_with_indices(
+  std::tie(output, indices) = at::max_pool2d(
      self.unsqueeze(2),
      {1, kernel_size[0]},
      {1, stride[0]},
@ -94,41 +91,5 @@ Tensor avg_pool1d(

  return output.squeeze(2);
 }
-
-Tensor max_pool1d(
-    const Tensor& self,
-    IntList kernel_size,
-    IntList stride,
-    IntList padding,
-    IntList dilation,
-    bool ceil_mode) {
-  auto output_and_indices = at::max_pool1d_with_indices(
-      self, kernel_size, stride, padding, dilation, ceil_mode);
-  return std::get<0>(output_and_indices);
-}
-
-Tensor max_pool2d(
-    const Tensor& self,
-    IntList kernel_size,
-    IntList stride,
-    IntList padding,
-    IntList dilation,
-    bool ceil_mode) {
-  auto output_and_indices = at::max_pool2d_with_indices(
-      self, kernel_size, stride, padding, dilation, ceil_mode);
-  return std::get<0>(output_and_indices);
-}
-
-Tensor max_pool3d(
-    const Tensor& self,
-    IntList kernel_size,
-    IntList stride,
-    IntList padding,
-    IntList dilation,
-    bool ceil_mode) {
-  auto output_and_indices = at::max_pool3d_with_indices(
-      self, kernel_size, stride, padding, dilation, ceil_mode);
-  return std::get<0>(output_and_indices);
-}
 } // namespace native
 } // namespace at
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@ -817,16 +817,7 @@

 - func: max_values(Tensor self, int64_t dim, bool keepdim=false) -> Tensor

- func: max_pool1d_with_indices(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, IntList[1] dilation=1, bool ceil_mode=false) -> (Tensor, Tensor)
-  variants: function
-
- func: max_pool1d(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, IntList[1] dilation=1, bool ceil_mode=false) -> Tensor
-  variants: function
-
- func: max_pool2d(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, IntList[1] dilation=1, bool ceil_mode=false) -> Tensor
-  variants: function
-
- func: max_pool3d(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, IntList[1] dilation=1, bool ceil_mode=false) -> Tensor
+- func: max_pool1d(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, IntList[1] dilation=1, bool ceil_mode=false) -> (Tensor, Tensor)
  variants: function

 # FIXME: These could be combined as optional<ScalarType> but for https://github.com/pytorch/pytorch/issues/6593.
--- a/aten/src/ATen/nn.yaml
+++ b/aten/src/ATen/nn.yaml
@ -149,12 +149,12 @@
  scalar_check:
    output: 'false'

- name: max_pool2d_with_indices(Tensor self, IntList[2] kernel_size, IntList[2] stride={}, IntList[2] padding=0, IntList[2] dilation=1, bool ceil_mode=false)
+- name: max_pool2d(Tensor self, IntList[2] kernel_size, IntList[2] stride={}, IntList[2] padding=0, IntList[2] dilation=1, bool ceil_mode=false)
  cname: SpatialDilatedMaxPooling
  default_init:
    stride: kernel_size

- name: max_pool3d_with_indices(Tensor self, IntList[3] kernel_size, IntList[3] stride={}, IntList[3] padding=0, IntList[3] dilation=1, bool ceil_mode=false)
+- name: max_pool3d(Tensor self, IntList[3] kernel_size, IntList[3] stride={}, IntList[3] padding=0, IntList[3] dilation=1, bool ceil_mode=false)
  cname: VolumetricDilatedMaxPooling
  default_init:
    stride: kernel_size
--- a/caffe2/core/blob_test.cc
+++ b/caffe2/core/blob_test.cc
@ -521,7 +521,7 @@ TEST(TensorTest, TensorNonFundamentalType) {
  }
 }

-TEST(TensorTest, TensorNonFundamentalTypeCopy) {
+TEST(TensorTest, TensorNonFundamentalTypeClone) {
  TensorCPU tensor(vector<int>{2, 3, 4});
  std::string* ptr = tensor.mutable_data<std::string>();
  EXPECT_TRUE(ptr != nullptr);
@ -529,11 +529,20 @@ TEST(TensorTest, TensorNonFundamentalTypeCopy) {
    EXPECT_TRUE(ptr[i] == "");
    ptr[i] = "filled";
  }
-  TensorCPU dst_tensor(tensor);
+  TensorCPU dst_tensor = tensor.Clone();
  const std::string* dst_ptr = dst_tensor.data<std::string>();
  for (int i = 0; i < dst_tensor.size(); ++i) {
    EXPECT_TRUE(dst_ptr[i] == "filled");
  }
+  // Change the original tensor
+  for (int i = 0; i < tensor.size(); ++i) {
+    EXPECT_TRUE(ptr[i] == "filled");
+    ptr[i] = "changed";
+  }
+  // Confirm that the cloned tensor is not affected
+  for (int i = 0; i < dst_tensor.size(); ++i) {
+    EXPECT_TRUE(dst_ptr[i] == "filled");
+  }
 }

 TEST(TensorTest, Tensor64BitDimension) {
@ -1060,5 +1069,47 @@ TEST(BlobTest, CastingMessage) {
  }
 }

+TEST(TensorConstruction, UnitializedCopyTest) {
+  CPUContext context;
+  TensorCPU x;
+  TensorCPU y(x, &context);
+  TensorCPU z = x.Clone();
+  // should be uninitialized
+  EXPECT_EQ(x.size(), -1);
+  EXPECT_EQ(y.size(), -1);
+  LOG(INFO) << "z.size()" << z.size();
+  EXPECT_EQ(z.size(), -1);
+}
+
+TEST(TensorConstruction, CopyConstructorTest) {
+  CPUContext context;
+
+  TensorCPU x;
+  x.Resize(5);
+  x.mutable_data<float>()[0] = 1;
+  TensorCPU y = x.Clone();
+  TensorCPU z(x, &context);
+  TensorCPU w;
+
+  EXPECT_EQ(*x.data<float>(), 1);
+  EXPECT_EQ(*y.data<float>(), 1);
+  EXPECT_EQ(*z.data<float>(), 1);
+  x.mutable_data<float>()[0] = 5;
+  EXPECT_EQ(*x.data<float>(), 5);
+  EXPECT_EQ(*y.data<float>(), 1);
+  EXPECT_EQ(*z.data<float>(), 1);
+}
+
+TEST(TensorConstruction, MoveConstructorTest) {
+  CPUContext context;
+
+  TensorCPU x;
+  x.Resize(5);
+  x.mutable_data<float>()[0] = 1;
+  TensorCPU y = std::move(x);
+
+  EXPECT_EQ(*y.data<float>(), 1);
+}
+
 } // namespace
 } // namespace caffe2
--- a/caffe2/core/logging_is_google_glog.h
+++ b/caffe2/core/logging_is_google_glog.h
@ -12,9 +12,9 @@
 #include <cuda.h>
 #endif

-#if (!defined(__CUDACC__) || CUDA_VERSION > 9000 ) && !defined(CAFFE2_USE_MINIMAL_GOOGLE_GLOG)
+#if !defined(__CUDACC__) && !defined(CAFFE2_USE_MINIMAL_GOOGLE_GLOG)
 #include <glog/stl_logging.h>
-#else // (!defined(__CUDACC__) || CUDA_VERSION > 9000 ) && !defined(CAFFE2_USE_MINIMAL_GOOGLE_GLOG)
+#else // !defined(__CUDACC__) && !defined(CAFFE2_USE_MINIMAL_GOOGLE_GLOG)

 // here, we need to register a fake overload for vector/string - here,
 // we just ignore the entries in the logs.
--- a/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpClasses.h
+++ b/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpClasses.h
@ -555,13 +555,32 @@ class GivenTensorFill : public NeuralNetOperator {

 class Concat : public NeuralNetOperator {
 public:
-  Concat() : NeuralNetOperator(NNKind::Concat) {}
+  Concat(int axis = -1, bool addAxis = false)
+      : NeuralNetOperator(NNKind::Concat), Axis(axis), AddAxis(addAxis) {}

  ~Concat() {}

  NOMNIGRAPH_DEFINE_NN_RTTI(Concat);

+  int getAxis() const {
+    return Axis;
+  }
+
+  bool getAddAxis() const {
+    return AddAxis;
+  }
+
+  void setAxis(int axis) {
+    Axis = axis;
+  }
+
+  void setAddAxis(bool addAxis) {
+    AddAxis = addAxis;
+  }
+
 private:
+  int Axis;
+  bool AddAxis;
 };

 class Softmax : public NeuralNetOperator {
@ -908,3 +927,68 @@ class Int8MaxPoolRelu : public NeuralNetOperator {

 private:
 };
+
+class BatchMatMul : public NeuralNetOperator {
+ public:
+  BatchMatMul(bool transA = false, bool transB = true, bool broadcast = false)
+      : NeuralNetOperator(NNKind::BatchMatMul),
+        TransA(transA),
+        TransB(transB),
+        Broadcast(broadcast) {}
+
+  ~BatchMatMul() {}
+
+  NOMNIGRAPH_DEFINE_NN_RTTI(BatchMatMul);
+
+  bool getTransA() const {
+    return TransA;
+  }
+
+  bool getTransB() const {
+    return TransB;
+  }
+
+  bool getBroadcast() const {
+    return Broadcast;
+  }
+
+  void setTransA(bool transA) {
+    TransA = transA;
+  }
+
+  void setTransB(bool transB) {
+    TransB = transB;
+  }
+
+  void setBroadcast(bool broadcast) {
+    Broadcast = broadcast;
+  }
+
+ private:
+  bool TransA;
+  bool TransB;
+  bool Broadcast;
+};
+
+class BatchGather : public NeuralNetOperator {
+ public:
+  BatchGather() : NeuralNetOperator(NNKind::BatchGather) {}
+
+  ~BatchGather() {}
+
+  NOMNIGRAPH_DEFINE_NN_RTTI(BatchGather);
+
+ private:
+};
+
+class ConcatBatchMatMulBatchGatherOp : public NeuralNetOperator {
+ public:
+  ConcatBatchMatMulBatchGatherOp()
+      : NeuralNetOperator(NNKind::ConcatBatchMatMulBatchGatherOp) {}
+
+  ~ConcatBatchMatMulBatchGatherOp() {}
+
+  NOMNIGRAPH_DEFINE_NN_RTTI(ConcatBatchMatMulBatchGatherOp);
+
+ private:
+};
--- a/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpEnum.h
+++ b/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpEnum.h
@ -5,4 +5,5 @@ Relu, Conv, ConvRelu, ConvTranspose, AveragePool, AveragePoolRelu, MaxPool,
    Int8Conv, Int8ConvTranspose, Int8FC, Int8MaxPool, Int8Relu,
    Int8GivenTensorFill, Int8Concat, Int8Softmax, Int8ChannelShuffle, Int8Sum,
    Int8Add, Int8Reshape, Int8Flatten, Int8ConvRelu, Int8SumRelu,
-    Int8AveragePoolRelu, Int8MaxPoolRelu
+    Int8AveragePoolRelu, Int8MaxPoolRelu, BatchMatMul, BatchGather,
+    ConcatBatchMatMulBatchGatherOp
--- a/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpNames.h
+++ b/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpNames.h
@ -84,3 +84,9 @@ case NNKind::Int8AveragePoolRelu:
  return "Int8AveragePoolRelu";
 case NNKind::Int8MaxPoolRelu:
  return "Int8MaxPoolRelu";
+case NNKind::BatchMatMul:
+  return "BatchMatMul";
+case NNKind::BatchGather:
+  return "BatchGather";
+case NNKind::ConcatBatchMatMulBatchGatherOp:
+  return "ConcatBatchMatMulBatchGatherOp";
--- a/caffe2/core/nomnigraph/include/nomnigraph/Support/Common.h
+++ b/caffe2/core/nomnigraph/include/nomnigraph/Support/Common.h
@ -14,6 +14,24 @@
 #include <functional>
 #include <list>

+// These #defines are useful when writing passes as they collapse
+//
+// if (!cond) {
+//   continue; // or break; or return;
+// }
+//
+// into a single line without negation
+
+#define NOM_REQUIRE_OR_(_cond, _expr) \
+  if (!(_cond)) {                     \
+    _expr;                            \
+  }
+
+#define NOM_REQUIRE_OR_CONT(_cond) NOM_REQUIRE_OR_(_cond, continue)
+#define NOM_REQUIRE_OR_BREAK(_cond) NOM_REQUIRE_OR_(_cond, break)
+#define NOM_REQUIRE_OR_RET_NULL(_cond) NOM_REQUIRE_OR_(_cond, return nullptr)
+#define NOM_REQUIRE_OR_RET(_cond) NOM_REQUIRE_OR_(_cond, return )
+
 // Implements accessors for a generic type T. If the type is not
 // specified (i.e., void template type) then the partial specification
 // gives an empty type.
--- a/caffe2/core/nomnigraph/ops.def
+++ b/caffe2/core/nomnigraph/ops.def
@ -55,6 +55,8 @@ BatchNormalization
 FC
 GivenTensorFill
 Concat
+- Axis : int : -1
+- AddAxis : bool : false
 Softmax
 ChannelShuffle
 Add
@ -84,3 +86,10 @@ Int8ConvRelu : ConvRelu
 Int8SumRelu : SumRelu
 Int8AveragePoolRelu : AveragePoolRelu
 Int8MaxPoolRelu : MaxPoolRelu
+
+BatchMatMul
+- TransA : bool : false
+- TransB : bool : true
+- Broadcast: bool : false
+BatchGather
+ConcatBatchMatMulBatchGatherOp
--- a/caffe2/core/plan_executor.cc
+++ b/caffe2/core/plan_executor.cc
@ -124,7 +124,7 @@ struct WorkspaceIdInjector {
  void InjectWorkspaceId(Workspace* workspace) {
    if (workspace->HasBlob(NODE_ID)) {
      Blob* node_id_blob = workspace->GetBlob(NODE_ID);
-      TensorCPU node_id_tensor = node_id_blob->template Get<TensorCPU>();
+      const TensorCPU& node_id_tensor = node_id_blob->template Get<TensorCPU>();
      int node_id = node_id_tensor.template data<int32_t>()[0];
      CAFFE_ENFORCE(
          seq_ < (1 << 16),
--- a/caffe2/core/tensor.h
+++ b/caffe2/core/tensor.h
@ -168,6 +168,15 @@ class Tensor {
      return;
    }
    meta_ = src.meta();
+    if (src.size() == -1) {
+      dims_.clear();
+      size_ = -1;
+      data_.reset();
+      shares_data_ = false;
+      capacity_ = 0;
+      reserved_ = false;
+      return;
+    }
    Resize(src.dims());
    if (size() > 0) {
      if (meta_.copy()) {
@ -681,6 +690,21 @@ class Tensor {
    return dims_[i];
  }

+  Tensor Clone() const {
+    Tensor x;
+    x.CopyFrom(*this);
+    return x;
+  }
+
+  Tensor(Tensor<Context>&& src) noexcept {
+    swap(src);
+  }
+
+  /**
+   * @brief Delete the copy constructor and use Clone explicitly
+   */
+  Tensor(const Tensor<Context>& src) = delete;
+
 protected:
  vector<TIndex> dims_;
  TIndex size_ = -1;
--- a/caffe2/core/typeid.h
+++ b/caffe2/core/typeid.h
@ -27,7 +27,9 @@ class CaffeTypeId final : public c10::guts::IdWrapper<CaffeTypeId, uint16_t> {
 public:
  static CaffeTypeId createTypeId();

-  friend std::ostream& operator<<(std::ostream& stream, CaffeTypeId typeId);
+  friend std::ostream& operator<<(std::ostream& stream, CaffeTypeId typeId) {
+    return stream << typeId.underlyingId();
+  }
  friend bool operator<(CaffeTypeId lhs, CaffeTypeId rhs);

  // TODO Can we get rid of uninitialized?
@ -39,10 +41,6 @@ private:
    constexpr explicit CaffeTypeId(uint16_t id): IdWrapper(id) {}
 };

-inline std::ostream& operator<<(std::ostream& stream, CaffeTypeId typeId) {
-  return stream << typeId.underlyingId();
-}
-
 // Allow usage in std::map / std::set
 // TODO Disallow this and rather use std::unordered_map/set everywhere
 inline bool operator<(CaffeTypeId lhs, CaffeTypeId rhs) {
--- a/caffe2/experiments/operators/fully_connected_op_decomposition.cc
+++ b/caffe2/experiments/operators/fully_connected_op_decomposition.cc
@ -1,3 +1,19 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #include "caffe2/experiments/operators/fully_connected_op_decomposition.h"

 namespace caffe2 {
--- a/caffe2/experiments/operators/fully_connected_op_decomposition.h
+++ b/caffe2/experiments/operators/fully_connected_op_decomposition.h
@ -1,3 +1,19 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #ifndef CAFFE2_OPERATORS_FULLY_CONNECTED_OP_DECOMPOSITION_H_
 #define CAFFE2_OPERATORS_FULLY_CONNECTED_OP_DECOMPOSITION_H_

--- a/caffe2/experiments/operators/fully_connected_op_decomposition_gpu.cc
+++ b/caffe2/experiments/operators/fully_connected_op_decomposition_gpu.cc
@ -1,3 +1,19 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #include "caffe2/core/context_gpu.h"
 #include "caffe2/experiments/operators/fully_connected_op_decomposition.h"

--- a/caffe2/experiments/operators/fully_connected_op_prune.cc
+++ b/caffe2/experiments/operators/fully_connected_op_prune.cc
@ -1,3 +1,19 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #include "caffe2/experiments/operators/fully_connected_op_prune.h"

 namespace caffe2 {
--- a/caffe2/experiments/operators/fully_connected_op_prune.h
+++ b/caffe2/experiments/operators/fully_connected_op_prune.h
@ -1,3 +1,19 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #ifndef CAFFE2_OPERATORS_FULLY_CONNECTED_OP_PRUNE_H_
 #define CAFFE2_OPERATORS_FULLY_CONNECTED_OP_PRUNE_H_

--- a/caffe2/experiments/operators/fully_connected_op_sparse.cc
+++ b/caffe2/experiments/operators/fully_connected_op_sparse.cc
@ -1,3 +1,19 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #include "caffe2/experiments/operators/fully_connected_op_sparse.h"

 namespace caffe2 {
--- a/caffe2/experiments/operators/fully_connected_op_sparse.h
+++ b/caffe2/experiments/operators/fully_connected_op_sparse.h
@ -1,3 +1,19 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #ifndef CAFFE2_OPERATORS_FULLY_CONNECTED_OP_SPARSE_H_
 #define CAFFE2_OPERATORS_FULLY_CONNECTED_OP_SPARSE_H_

--- a/caffe2/experiments/operators/funhash_op.cc
+++ b/caffe2/experiments/operators/funhash_op.cc
@ -1,3 +1,19 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #include "caffe2/experiments/operators/funhash_op.h"

 namespace caffe2 {
--- a/caffe2/experiments/operators/funhash_op.h
+++ b/caffe2/experiments/operators/funhash_op.h
@ -1,3 +1,19 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #ifndef CAFFE2_OPERATORS_FUNHASH_OP_H_
 #define CAFFE2_OPERATORS_FUNHASH_OP_H_

--- a/caffe2/experiments/operators/sparse_funhash_op.cc
+++ b/caffe2/experiments/operators/sparse_funhash_op.cc
@ -1,3 +1,19 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #include "caffe2/experiments/operators/sparse_funhash_op.h"

 namespace caffe2 {
--- a/caffe2/experiments/operators/sparse_funhash_op.h
+++ b/caffe2/experiments/operators/sparse_funhash_op.h
@ -1,3 +1,19 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #ifndef CAFFE2_OPERATORS_SPARSE_FUNHASH_OP_H_
 #define CAFFE2_OPERATORS_SPARSE_FUNHASH_OP_H_

--- a/caffe2/experiments/operators/sparse_matrix_reshape_op.cc
+++ b/caffe2/experiments/operators/sparse_matrix_reshape_op.cc
@ -1,3 +1,19 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #include "caffe2/experiments/operators/sparse_matrix_reshape_op.h"

 namespace caffe2 {
--- a/caffe2/experiments/operators/sparse_matrix_reshape_op.h
+++ b/caffe2/experiments/operators/sparse_matrix_reshape_op.h
@ -1,3 +1,19 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #ifndef CAFFE2_OPERATORS_SPARSE_MATRIX_RESHAPE_H_
 #define CAFFE2_OPERATORS_SPARSE_MATRIX_RESHAPE_H_

--- a/caffe2/experiments/operators/tt_contraction_op.cc
+++ b/caffe2/experiments/operators/tt_contraction_op.cc
@ -1,3 +1,19 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #include "caffe2/experiments/operators/tt_contraction_op.h"

 namespace caffe2 {
--- a/caffe2/experiments/operators/tt_contraction_op.h
+++ b/caffe2/experiments/operators/tt_contraction_op.h
@ -1,3 +1,19 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #ifndef CAFFE2_OPERATORS_TT_CONTRACTION_OP_H_
 #define CAFFE2_OPERATORS_TT_CONTRACTION_OP_H_

--- a/caffe2/experiments/operators/tt_contraction_op_gpu.cc
+++ b/caffe2/experiments/operators/tt_contraction_op_gpu.cc
@ -1,3 +1,19 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #include "caffe2/core/context_gpu.h"
 #include "caffe2/experiments/operators/tt_contraction_op.h"

--- a/caffe2/experiments/operators/tt_pad_op.cc
+++ b/caffe2/experiments/operators/tt_pad_op.cc
@ -1,3 +1,19 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #include "caffe2/experiments/operators/tt_pad_op.h"

 namespace caffe2 {
--- a/caffe2/experiments/operators/tt_pad_op.h
+++ b/caffe2/experiments/operators/tt_pad_op.h
@ -1,3 +1,19 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #ifndef CAFFE2_OPERATORS_TT_PAD_OP_H_
 #define CAFFE2_OPERATORS_TT_PAD_OP_H_

--- a/caffe2/experiments/python/SparseTransformer.py
+++ b/caffe2/experiments/python/SparseTransformer.py
@ -1,3 +1,18 @@
+# Copyright (c) 2016-present, Facebook, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+##############################################################################
+
 ## @package SparseTransformer
 # Module caffe2.experiments.python.SparseTransformer
 from __future__ import absolute_import
--- a/caffe2/experiments/python/convnet_benchmarks.py
+++ b/caffe2/experiments/python/convnet_benchmarks.py
@ -1,3 +1,18 @@
+# Copyright (c) 2016-present, Facebook, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+##############################################################################
+
 ## @package convnet_benchmarks
 # Module caffe2.experiments.python.convnet_benchmarks
 from __future__ import absolute_import
--- a/caffe2/experiments/python/device_reduce_sum_bench.py
+++ b/caffe2/experiments/python/device_reduce_sum_bench.py
@ -1,3 +1,18 @@
+# Copyright (c) 2016-present, Facebook, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+##############################################################################
+
 ## @package device_reduce_sum_bench
 # Module caffe2.experiments.python.device_reduce_sum_bench
 from __future__ import absolute_import
--- a/caffe2/experiments/python/funhash_op_test.py
+++ b/caffe2/experiments/python/funhash_op_test.py
@ -1,3 +1,18 @@
+# Copyright (c) 2016-present, Facebook, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+##############################################################################
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
--- a/caffe2/experiments/python/net_construct_bench.py
+++ b/caffe2/experiments/python/net_construct_bench.py
@ -1,3 +1,18 @@
+# Copyright (c) 2016-present, Facebook, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+##############################################################################
+
 ## @package net_construct_bench
 # Module caffe2.experiments.python.net_construct_bench
 from __future__ import absolute_import
--- a/caffe2/experiments/python/sparse_funhash_op_test.py
+++ b/caffe2/experiments/python/sparse_funhash_op_test.py
@ -1,3 +1,18 @@
+# Copyright (c) 2016-present, Facebook, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+##############################################################################
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
--- a/caffe2/experiments/python/sparse_reshape_op_test.py
+++ b/caffe2/experiments/python/sparse_reshape_op_test.py
@ -1,3 +1,18 @@
+# Copyright (c) 2016-present, Facebook, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+##############################################################################
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
--- a/caffe2/experiments/python/tt_contraction_op_test.py
+++ b/caffe2/experiments/python/tt_contraction_op_test.py
@ -1,3 +1,18 @@
+# Copyright (c) 2016-present, Facebook, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+##############################################################################
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
--- a/caffe2/experiments/python/tt_pad_op_test.py
+++ b/caffe2/experiments/python/tt_pad_op_test.py
@ -1,3 +1,18 @@
+# Copyright (c) 2016-present, Facebook, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+##############################################################################
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
--- a/caffe2/mobile/contrib/libopencl-stub/Android.mk
+++ b/caffe2/mobile/contrib/libopencl-stub/Android.mk
@ -1,15 +0,0 @@
-# Android makefile
-# Build this using ndk as
-# ndk-build NDK_PROJECT_PATH=.  APP_BUILD_SCRIPT=Android.mk
-#
-
-LOCAL_PATH := $(call my-dir)
-
-include $(CLEAR_VARS)
-LOCAL_MODULE := libOpenCL
-LOCAL_C_INCLUDES := $(LOCAL_PATH)/include/
-LOCAL_SRC_FILES :=  src/libopencl.c
-LOCAL_CFLAGS   = -fPIC -O2
-
-include $(BUILD_STATIC_LIBRARY)
-
--- a/caffe2/mobile/contrib/ulp2/ulp.cc
+++ b/caffe2/mobile/contrib/ulp2/ulp.cc
@ -286,7 +286,8 @@ std::unique_ptr<QConvState> create2b1bConvState(Workspace* ws,
 #endif
  };
  if (b) {
-    state->bias = caffe2::make_unique<TensorCPU>(*b);
+    CPUContext context;
+    state->bias = caffe2::make_unique<TensorCPU>(*b, &context);
  }
  return state;
 }
--- a/caffe2/operators/abs_op.cc
+++ b/caffe2/operators/abs_op.cc
@ -78,7 +78,7 @@ Y: [0.3005476  1.551666   1.3591481  0.39191285 0.21866608]
 </details>

 )DOC")
-    .Input(0, "X", "*(type: Tensor<float\>)* Input tensor.")
+    .Input(0, "X", "*(type: Tensor<float\\>)* Input tensor.")
    .Output(
        0,
        "Y",
--- a/caffe2/operators/elementwise_op_test.h
+++ b/caffe2/operators/elementwise_op_test.h
@ -59,7 +59,7 @@ void elementwiseAnd() {
    EXPECT_TRUE(op->Run());
    auto* blob = ws.GetBlob("Z");
    EXPECT_NE(nullptr, blob);
-    caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>());
+    const auto& Z = blob->Get<caffe2::Tensor<Context>>();
    EXPECT_EQ(Z.size(), N);
    std::vector<bool> result{true, false, false, false};
    for (size_t i = 0; i < Z.size(); ++i) {
@ -79,7 +79,7 @@ void elementwiseAnd() {
    EXPECT_TRUE(op->Run());
    auto* blob = ws.GetBlob("Z");
    EXPECT_NE(nullptr, blob);
-    caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>());
+    const auto& Z = blob->Get<caffe2::Tensor<Context>>();
    EXPECT_EQ(Z.size(), M * N);
    std::vector<bool> result{
        true, false, false, false, true, false, false, false};
@ -105,7 +105,7 @@ void elementwiseOr() {
    EXPECT_TRUE(op->Run());
    auto* blob = ws.GetBlob("Z");
    EXPECT_NE(nullptr, blob);
-    caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>());
+    const auto& Z = blob->Get<caffe2::Tensor<Context>>();
    EXPECT_EQ(Z.size(), N);
    std::vector<bool> result{true, true, true, false};
    for (size_t i = 0; i < Z.size(); ++i) {
@ -125,7 +125,7 @@ void elementwiseOr() {
    EXPECT_TRUE(op->Run());
    auto* blob = ws.GetBlob("Z");
    EXPECT_NE(nullptr, blob);
-    caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>());
+    const auto& Z = blob->Get<caffe2::Tensor<Context>>();
    EXPECT_EQ(Z.size(), M * N);
    std::vector<bool> result{true, true, true, false, true, true, true, false};
    for (size_t i = 0; i < Z.size(); ++i) {
@ -150,7 +150,7 @@ void elementwiseXor() {
    EXPECT_TRUE(op->Run());
    auto* blob = ws.GetBlob("Z");
    EXPECT_NE(nullptr, blob);
-    caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>());
+    const auto& Z = blob->Get<caffe2::Tensor<Context>>();
    EXPECT_EQ(Z.size(), N);
    std::vector<bool> result{false, true, true, false};
    for (size_t i = 0; i < Z.size(); ++i) {
@ -170,7 +170,7 @@ void elementwiseXor() {
    EXPECT_TRUE(op->Run());
    auto* blob = ws.GetBlob("Z");
    EXPECT_NE(nullptr, blob);
-    caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>());
+    const auto& Z = blob->Get<caffe2::Tensor<Context>>();
    EXPECT_EQ(Z.size(), M * N);
    std::vector<bool> result{
        false, true, true, false, false, true, true, false};
@ -195,7 +195,7 @@ void elementwiseNot() {
  EXPECT_TRUE(op->Run());
  auto* blob = ws.GetBlob("Y");
  EXPECT_NE(nullptr, blob);
-  caffe2::TensorCPU Y(blob->Get<caffe2::Tensor<Context>>());
+  const auto& Y = blob->Get<caffe2::Tensor<Context>>();
  EXPECT_EQ(Y.size(), N);
  std::vector<bool> result{false, true};
  for (size_t i = 0; i < Y.size(); ++i) {
@ -217,7 +217,7 @@ void elementwiseEQ() {
    EXPECT_TRUE(op->Run());
    auto* blob = ws.GetBlob("Z");
    EXPECT_NE(nullptr, blob);
-    caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>());
+    const auto& Z = blob->Get<caffe2::Tensor<Context>>();
    EXPECT_EQ(Z.size(), N);
    std::vector<bool> result{false, true, false, true};
    for (size_t i = 0; i < Z.size(); ++i) {
@ -234,7 +234,7 @@ void elementwiseEQ() {
    EXPECT_TRUE(op->Run());
    auto* blob = ws.GetBlob("Z");
    EXPECT_NE(nullptr, blob);
-    caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>());
+    const auto& Z = blob->Get<caffe2::Tensor<Context>>();
    EXPECT_EQ(Z.size(), N);
    std::vector<bool> result{true, true, false, false};
    for (size_t i = 0; i < Z.size(); ++i) {
@ -253,7 +253,7 @@ void elementwiseEQ() {
    EXPECT_TRUE(op->Run());
    auto* blob = ws.GetBlob("Z");
    EXPECT_NE(nullptr, blob);
-    caffe2::TensorCPU Z(blob->Get<caffe2::Tensor<Context>>());
+    const auto& Z = blob->Get<caffe2::Tensor<Context>>();
    EXPECT_EQ(Z.size(), M * N);
    std::vector<bool> result{
        true, false, false, true, false, true, true, false};
--- a/caffe2/operators/reduction_ops.cc
+++ b/caffe2/operators/reduction_ops.cc
@ -296,13 +296,14 @@ bool SumElementsGradientOp<T, Context>::RunOnDevice()
 #endif
 {
  auto& X = Input(0);
-  TensorCPU sum_grad = TensorCPU(Input(1));
+  const auto& sum_grad = Input(1);
  auto* dX = Output(0);
  dX->ResizeLike(X);
  DCHECK_EQ(sum_grad.size(), 1);
  math::Set<T, Context>(
      dX->size(),
-      static_cast<T>(sum_grad.data<T>()[0] * (average_ ? 1.0 / X.size() : 1)),
+      static_cast<T>(
+          sum_grad.template data<T>()[0] * (average_ ? 1.0 / X.size() : 1)),
      dX->template mutable_data<T>(),
      &context_);
  return true;
--- a/caffe2/operators/sequence_ops.cc
+++ b/caffe2/operators/sequence_ops.cc
@ -385,7 +385,7 @@ lengths_out: [5]
    .Output(
        0,
        "data_out",
-        "*(type: Tensor)* Padded data tensor ($T<N + 2*padding\_width, "
+        "*(type: Tensor)* Padded data tensor ($T<N + 2*padding\\_width, "
        "D_1, ..., D_n>$).")
    .Output(
        1,
@ -483,7 +483,7 @@ lengths_out_rm: [3]
        0,
        "data_out",
        "*(type: Tensor)* Padded data tensor "
-        "($T<N + 2*padding\_width, D_1, ..., D_n>$).")
+        "($T<N + 2*padding\\_width, D_1, ..., D_n>$).")
    .Output(
        1,
        "lengths_out",
--- a/caffe2/opt/converter.cc
+++ b/caffe2/opt/converter.cc
@ -128,6 +128,49 @@ convertToNeuralNetOperator(caffe2::OperatorDef* op) {
    nnOp = util::make_unique<repr::BatchNormalization>();
  }

+  if (op->type() == "Concat") {
+    nnOp = util::make_unique<repr::Concat>();
+    auto c = dyn_cast<repr::Concat>(nnOp.get());
+    if (argMap.count("axis")) {
+      CAFFE_ENFORCE(argMap["axis"].has_i(), "Invalid axis argument");
+      int axis = static_cast<int>(argMap["axis"].i());
+      c->setAxis(axis);
+    }
+    if (argMap.count("add_axis")) {
+      CAFFE_ENFORCE(argMap["add_axis"].has_i(), "Invalid add_axis argument");
+      int add_axis = static_cast<int>(argMap["add_axis"].i());
+      c->setAddAxis(!!add_axis);
+    }
+  }
+
+  if (op->type() == "Flatten") {
+    nnOp = util::make_unique<repr::Flatten>();
+  }
+
+  if (op->type() == "BatchGather") {
+    nnOp = util::make_unique<repr::BatchGather>();
+  }
+
+  if (op->type() == "BatchMatMul") { // trans_a/trans_b/broadcast: optional ints
+    nnOp = util::make_unique<repr::BatchMatMul>();
+    auto c = dyn_cast<repr::BatchMatMul>(nnOp.get());
+    if (argMap.count("trans_a")) {
+      CAFFE_ENFORCE(argMap["trans_a"].has_i(), "Invalid trans_a argument");
+      int trans_a = static_cast<int>(argMap["trans_a"].i());
+      c->setTransA(!!trans_a);
+    }
+    if (argMap.count("trans_b")) {
+      CAFFE_ENFORCE(argMap["trans_b"].has_i(), "Invalid trans_b argument");
+      int trans_b = static_cast<int>(argMap["trans_b"].i());
+      c->setTransB(!!trans_b);
+    }
+    if (argMap.count("broadcast")) {
+      CAFFE_ENFORCE(argMap["broadcast"].has_i(), "Invalid broadcast argument");
+      int broadcast = static_cast<int>(argMap["broadcast"].i());
+      c->setBroadcast(!!broadcast);
+    }
+  }
+
  if (!nnOp) {
    nnOp = util::make_unique<repr::GenericOperator>(op->type());
  }
--- a/caffe2/opt/passes.h
+++ b/caffe2/opt/passes.h
@ -25,7 +25,7 @@ class OptimizationPass {
 public:
  OptimizationPass(NNModule* nn) : nn_(nn) {}
  virtual void run() = 0;
-  virtual ~OptimizationPass() = 0;
+  virtual ~OptimizationPass(){}

 protected:
  NNModule* nn_;
@ -34,6 +34,7 @@ class OptimizationPass {
 class WorkspaceOptimizationPass : public OptimizationPass {
 public:
  WorkspaceOptimizationPass(NNModule* nn, Workspace* ws) : OptimizationPass(nn), ws_(ws) {}
+  virtual ~WorkspaceOptimizationPass(){}

 protected:
  Workspace* ws_;
@ -42,26 +43,28 @@ class WorkspaceOptimizationPass : public OptimizationPass {
 CAFFE_DECLARE_REGISTRY(WorkspaceOptimizationPassRegistry, WorkspaceOptimizationPass, NNModule*, Workspace*);
 #define REGISTER_WS_OPT_PASS(clsname) \
  CAFFE_REGISTER_CLASS(WorkspaceOptimizationPassRegistry, clsname, clsname)
-#define REGISTER_WS_OPT_PASS_FROM_FUNC(passname, funcname) \
-  class passname : public WorkspaceOptimizationPass { \
-   public: \
+#define REGISTER_WS_OPT_PASS_FROM_FUNC(passname, funcname)      \
+  class passname : public WorkspaceOptimizationPass {           \
+   public:                                                      \
    using WorkspaceOptimizationPass::WorkspaceOptimizationPass; \
-    void run() override { \
-      funcname(nn_, ws_); \
-    } \
-  };
+    void run() override {                                       \
+      funcname(nn_, ws_);                                       \
+    }                                                           \
+  };                                                            \
+  REGISTER_WS_OPT_PASS(passname);

 CAFFE_DECLARE_REGISTRY(OptimizationPassRegistry, OptimizationPass, NNModule*);
 #define REGISTER_OPT_PASS(clsname) \
  CAFFE_REGISTER_CLASS(OptimizationPassRegistry, clsname, clsname)
 #define REGISTER_OPT_PASS_FROM_FUNC(passname, funcname) \
-  class passname : public OptimizationPass { \
-   public: \
-    using OptimizationPass::OptimizationPass; \
-    void run() override { \
-      funcname(nn_); \
-    } \
-  };
+  class passname : public OptimizationPass {            \
+   public:                                              \
+    using OptimizationPass::OptimizationPass;           \
+    void run() override {                               \
+      funcname(nn_);                                    \
+    }                                                   \
+  };                                                    \
+  REGISTER_OPT_PASS(passname);

 } // namespace caffe2

--- a/caffe2/python/layer_model_helper.py
+++ b/caffe2/python/layer_model_helper.py
@ -91,6 +91,7 @@ class LayerModelHelper(model_helper.ModelHelper):
        # additional (hard-coded) diagnose_options to report based on the model
        # TODO(xlwang): it's hack!
        self.ad_hoc_diagnose_blobs_and_operations = []
+        self.ad_hoc_plot_blobs = []

    def clear_output_schema(self):
        self._output_schema = None
@ -105,6 +106,11 @@ class LayerModelHelper(model_helper.ModelHelper):
            (name, value)
        )

+    def add_ad_hoc_plot_blob(self, blob, dtype=None):
+        dtype = dtype or (np.float64, (1, ))  # np.float was an alias of builtin float
+        self.add_metric_field(str(blob), schema.Scalar(dtype, blob))
+        self.ad_hoc_plot_blobs.append(blob)
+
    @staticmethod
    def _get_global_constant_initializer_op(
        blob_name, array=None, dtype=None, initializer=None
--- a/caffe2/python/layers/adaptive_weight.py
+++ b/caffe2/python/layers/adaptive_weight.py
@ -22,7 +22,9 @@ class AdaptiveWeight(ModelLayer):
        optimizer=None,
        weights=None,
        enable_diagnose=False,
-        estimation_method=None,
+        estimation_method="log_std",
+        pos_optim_method="log_barrier",
+        reg_lambda=0.1,
        **kwargs
    ):
        super(AdaptiveWeight, self).__init__(model, name, input_record, **kwargs)
@ -38,20 +40,23 @@ class AdaptiveWeight(ModelLayer):
            weights = [1. / self.num for _ in range(self.num)]
        assert min(weights) > 0, "initial weights must be positive"
        self.weights = np.array(weights).astype(np.float32)
-        self.estimation_method = estimation_method
-        if self.estimation_method is not None:
-            self.estimation_method_type = infer_thrift_union_selection(
-                estimation_method
-            ).lower()
-            self.estimation_method_value = estimation_method.value
-        else:
-            self.estimation_method_type = "log_std"
-            self.estimation_method_value = None
+        self.estimation_method = str(estimation_method).lower()
+        # used by the positivity-constrained parameterization (estimation method
+        # inv_var); the optimization method is either log barrier or grad proj
+        self.pos_optim_method = str(pos_optim_method).lower()
+        self.reg_lambda = float(reg_lambda)
        self.enable_diagnose = enable_diagnose
-        self.init_func = getattr(self, self.estimation_method_type + "_init")
-        self.weight_func = getattr(self, self.estimation_method_type + "_weight")
-        self.reg_func = getattr(self, self.estimation_method_type + "_reg")
+        self.init_func = getattr(self, self.estimation_method + "_init")
+        self.weight_func = getattr(self, self.estimation_method + "_weight")
+        self.reg_func = getattr(self, self.estimation_method + "_reg")
        self.init_func()
+        if self.enable_diagnose:
+            self.weight_i = [
+                self.get_next_blob_reference("adaptive_weight_%d" % i)
+                for i in range(self.num)
+            ]
+            for i in range(self.num):
+                self.model.add_ad_hoc_plot_blob(self.weight_i[i])

    def concat_data(self, net):
        reshaped = [net.NextScopedBlob("reshaped_data_%d" % i) for i in range(self.num)]
@ -110,15 +115,15 @@ class AdaptiveWeight(ModelLayer):
            "GivenTensorFill",
            {"values": values, "dtype": core.DataType.FLOAT},
        )
-        pos_optim_method = self.estimation_method_value.pos_optim_method.getType()
-        pos_optim_option = self.estimation_method_value.pos_optim_method.value
-        if pos_optim_method == "LOG_BARRIER":
-            regularizer = LogBarrier(float(reg_lambda=pos_optim_option.reg_lambda))
-        elif pos_optim_method == "POS_GRAD_PROJ":
+        if self.pos_optim_method == "log_barrier":
+            regularizer = LogBarrier(reg_lambda=self.reg_lambda)
+        elif self.pos_optim_method == "pos_grad_proj":
            regularizer = BoundedGradientProjection(lb=0, left_open=True)
        else:
            raise TypeError(
-                "unknown positivity optimization method: {}".format(pos_optim_method)
+                "unknown positivity optimization method: {}".format(
+                    self.pos_optim_method
+                )
            )
        self.k = self.create_param(
            param_name="k",
@ -136,7 +141,7 @@ class AdaptiveWeight(ModelLayer):
        net.Log(self.k, log_k)
        net.Scale(log_k, reg, scale=-0.5)

-    def add_ops(self, net):
+    def _add_ops_impl(self, net, enable_diagnose):
        x = self.concat_data(net)
        weight = net.NextScopedBlob("weight")
        reg = net.NextScopedBlob("reg")
@ -147,21 +152,9 @@ class AdaptiveWeight(ModelLayer):
        net.Mul([weight, x], weighted_x)
        net.Add([weighted_x, reg], weighted_x_add_reg)
        net.SumElements(weighted_x_add_reg, self.output_schema())
-        if self.enable_diagnose:
+        if enable_diagnose:
            for i in range(self.num):
-                weight_i = net.NextScopedBlob("weight_%d" % i)
-                net.Slice(weight, weight_i, starts=[i], ends=[i + 1])
+                net.Slice(weight, self.weight_i[i], starts=[i], ends=[i + 1])

-
-def infer_thrift_union_selection(ttype_union):
-    # TODO(xlwang): this is a hack way to infer the type str of a thrift union
-    # struct
-    assert ttype_union.isUnion(), "type {} is not a thrift union".format(
-        type(ttype_union)
-    )
-    field = ttype_union.field
-    for attr in dir(ttype_union):
-        v = getattr(ttype_union, attr)
-        if isinstance(v, int) and attr != "field" and v == field:
-            return attr
-    raise ValueError("Fail to infer the thrift union type")
+    def add_ops(self, net):
+        self._add_ops_impl(net, self.enable_diagnose)
--- a/caffe2/python/layers_test.py
+++ b/caffe2/python/layers_test.py
@ -1809,25 +1809,50 @@ class TestLayers(LayersTestCase):
    @given(
        num=st.integers(min_value=10, max_value=100),
        feed_weight=st.booleans(),
+        use_inv_var_parameterization=st.booleans(),
+        use_log_barrier=st.booleans(),
+        enable_diagnose=st.booleans(),
        **hu.gcs
    )
-    def testAdaptiveWeight(self, num, feed_weight, gc, dc):
+    def testAdaptiveWeight(
+        self, num, feed_weight, use_inv_var_parameterization, use_log_barrier,
+        enable_diagnose, gc, dc
+    ):
        input_record = self.new_record(schema.RawTuple(num))
        data = np.random.random(num)
        schema.FeedRecord(
-            input_record,
-            [np.array(x).astype(np.float32) for x in data]
+            input_record, [np.array(x).astype(np.float32) for x in data]
        )
        weights = np.random.random(num) if feed_weight else None
-        result = self.model.AdaptiveWeight(input_record, weights=weights)
+        result = self.model.AdaptiveWeight(
+            input_record,
+            weights=weights,
+            estimation_method=(
+                'inv_var' if use_inv_var_parameterization else 'log_std'
+            ),
+            pos_optim_method=(
+                'log_barrier' if use_log_barrier else 'pos_grad_proj'
+            ),
+            enable_diagnose=enable_diagnose
+        )
        train_init_net, train_net = self.get_training_nets(True)
        workspace.RunNetOnce(train_init_net)
        workspace.RunNetOnce(train_net)
        result = workspace.FetchBlob(result())
        if not feed_weight:
-            weights = 1. / num
+            weights = np.array([1. / num for _ in range(num)])
        expected = np.sum(weights * data + 0.5 * np.log(1. / 2. / weights))
        npt.assert_allclose(expected, result, atol=1e-4, rtol=1e-4)
+        if enable_diagnose:
+            assert len(self.model.ad_hoc_plot_blobs) == num
+            reconst_weights_from_ad_hoc = np.array(
+                [workspace.FetchBlob(b) for b in self.model.ad_hoc_plot_blobs]
+            ).flatten()
+            npt.assert_allclose(
+                reconst_weights_from_ad_hoc, weights, atol=1e-4, rtol=1e-4
+            )
+        else:
+            assert len(self.model.ad_hoc_plot_blobs) == 0

    @given(num=st.integers(min_value=10, max_value=100), **hu.gcs)
    def testConstantWeight(self, num, gc, dc):
--- a/caffe2/python/pybind_state.cc
+++ b/caffe2/python/pybind_state.cc
@ -21,6 +21,7 @@
 #include "caffe2/opt/fusion.h"
 #include "caffe2/opt/mobile.h"
 #include "caffe2/opt/optimize_ideep.h"
+#include "caffe2/opt/passes.h"
 #include "caffe2/opt/sink.h"
 #include "caffe2/utils/cpuid.h"
 #include "caffe2/utils/string_utils.h"
@ -1481,6 +1482,45 @@ void addGlobalMethods(py::module& m) {
  CAFFE2_CPU_FEATURE_SUPPORT(avx2);

 #undef CAFFE2_CPU_FEATURE_SUPPORT
+  m.def("transform_exists", [](const std::string& transform_name) {
+    return OptimizationPassRegistry()->Has(transform_name);
+  });
+  m.def("workspace_transform_exists", [](const std::string& transform_name) {
+    return WorkspaceOptimizationPassRegistry()->Has(transform_name);
+  });
+  m.def("run_transform", [](const std::string& transform_name, py::bytes def) {
+    caffe2::NetDef proto;
+    CAFFE_ENFORCE(ParseProtoFromLargeString(def.cast<std::string>(), &proto));
+    auto nn = caffe2::convertToNNModule(proto);
+    auto pass = OptimizationPassRegistry()->Create(transform_name, &nn);
+
+    CAFFE_ENFORCE(pass, "Pass doesn't exist: ", transform_name);
+    pass->run();
+
+    auto new_proto = caffe2::convertToCaffe2Proto(nn, proto);
+    std::string out;
+    new_proto.SerializeToString(&out);
+    return py::bytes(out);
+  });
+  m.def(
+      "run_workspace_transform",
+      [](const std::string& transform_name, py::bytes def) {
+        CAFFE_ENFORCE(gWorkspace);
+        caffe2::NetDef proto;
+        CAFFE_ENFORCE(
+            ParseProtoFromLargeString(def.cast<std::string>(), &proto));
+        auto nn = caffe2::convertToNNModule(proto);
+        auto pass = WorkspaceOptimizationPassRegistry()->Create(
+            transform_name, &nn, gWorkspace);
+
+        CAFFE_ENFORCE(pass, "Pass doesn't exist: ", transform_name);
+        pass->run();
+
+        auto new_proto = caffe2::convertToCaffe2Proto(nn, proto);
+        std::string out;
+        new_proto.SerializeToString(&out);
+        return py::bytes(out);
+      });

  // Transformations are exposed as functions here and wrapped
  // into a python interface in transformations.py
--- a/caffe2/python/pybind_state_int8.cc
+++ b/caffe2/python/pybind_state_int8.cc
@ -32,7 +32,7 @@ namespace python {
 class Int8TensorFetcher : public BlobFetcherBase {
 public:
  pybind11::object Fetch(const Blob& blob) override {
-    const caffe2::int8::Int8TensorCPU src =
+    const caffe2::int8::Int8TensorCPU& src =
        blob.template Get<caffe2::int8::Int8TensorCPU>();
    const int numpy_type = CaffeToNumpyType(src.t.meta());
    CAFFE_ENFORCE(numpy_type != -1, "Int8Tensor contains unknown type data");
--- a/caffe2/python/transformations.py
+++ b/caffe2/python/transformations.py
@ -21,10 +21,23 @@ from __future__ import unicode_literals
 import caffe2.python._import_c_extension as C


-def addNNPACK(net):
-    net.Proto().ParseFromString(
-        C.transform_addNNPACK(net.Proto().SerializeToString())
-    )
+class Transformer(object):
+    def __init__(self):
+        pass
+
+    @classmethod
+    def runTransform(cls, transform_name, net):
+        pb = net.Proto().SerializeToString()
+        if C.transform_exists(transform_name):
+            output = C.run_transform(transform_name, pb)
+        elif C.workspace_transform_exists(transform_name):
+            output = C.run_workspace_transform(transform_name, pb)
+        else:
+            raise AttributeError('Transformation {} not found.'.format(transform_name))
+        net.Proto().ParseFromString(output)
+
+    def __getattr__(self, transform_name):
+        return lambda net : self.runTransform(transform_name, net)


 def fuseNNPACKConvRelu(net):
--- a/caffe2/python/transformations_test.py
+++ b/caffe2/python/transformations_test.py
@ -22,14 +22,11 @@ from hypothesis import given
 import hypothesis.strategies as st
 import numpy as np

-from caffe2.python.transformations import (
-    addNNPACK,
-    fuseNNPACKConvRelu,
-    fuseConvBN,
-    sinkMaxPool,
-)
+from caffe2.python.transformations import Transformer
 from caffe2.python import core, workspace, test_util

+transformer = Transformer()
+

 def str_compare(a, b, encoding="utf8"):
    if isinstance(a, bytes):
@ -40,26 +37,21 @@ def str_compare(a, b, encoding="utf8"):


 class TestTransformations(test_util.TestCase):
-    def test_addNNPACK(self):
+    def test_transformer_AddNNPACK(self):
        net = core.Net("net")
-        net.Conv(
-            ["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW"
-        )
+        net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW")
        net.Relu(["Y"], ["Y2"])
-        addNNPACK(net)
+        transformer.AddNNPACK(net)
        assert str_compare(net.Proto().op[0].engine, "NNPACK")

-
-    def test_fuseNNPACKConvRelu(self):
+    def test_transformer_FuseNNPACKConvRelu(self):
        net = core.Net("net")
-        net.Conv(
-            ["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW"
-        )
+        net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW")
        net.Relu(["Y"], ["Y2"])
-        addNNPACK(net) # get the NNPACK engine
+        transformer.AddNNPACK(net)  # get the NNPACK engine
        assert str_compare(net.Proto().op[0].engine, "NNPACK")
-        fuseNNPACKConvRelu(net)
-        assert (len(net.Proto().op) == 1)
+        transformer.FuseNNPACKConvRelu(net)
+        assert len(net.Proto().op) == 1
        has_activation_arg = False
        for arg in net.Proto().op[0].arg:
            if str_compare(arg.name, "activation"):
@ -69,31 +61,27 @@ class TestTransformations(test_util.TestCase):

    def test_noFuseNNPACKConvRelu(self):
        net = core.Net("net")
-        net.Conv(
-            ["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW"
-        )
+        net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW")
        net.Relu(["Y"], ["Y2"])
        net.Relu(["Y"], ["Y3"])
-        addNNPACK(net) # get the NNPACK engine
+        transformer.AddNNPACK(net)  # get the NNPACK engine
        assert str_compare(net.Proto().op[0].engine, "NNPACK")
-        fuseNNPACKConvRelu(net)
-        assert (len(net.Proto().op) == 3)
+        transformer.FuseNNPACKConvRelu(net)
+        assert len(net.Proto().op) == 3
        has_activation_arg = False
        for arg in net.Proto().op[0].arg:
            if str_compare(arg.name, "activation") and str_compare(arg.s, "Relu"):
                has_activation_arg = True
        assert not has_activation_arg

-    def test_fuseNNPACKConvReluNoInplace(self):
+    def test_transformer_FuseNNPACKConvReluNoInplace(self):
        net = core.Net("net")
-        net.Conv(
-            ["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW"
-        )
+        net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW")
        net.Relu(["Y"], ["X"])
-        addNNPACK(net) # get the NNPACK engine
+        transformer.AddNNPACK(net)  # get the NNPACK engine
        assert str_compare(net.Proto().op[0].engine, "NNPACK")
-        fuseNNPACKConvRelu(net)
-        assert (len(net.Proto().op) == 1)
+        transformer.FuseNNPACKConvRelu(net)
+        assert len(net.Proto().op) == 1
        has_activation_arg = False
        for arg in net.Proto().op[0].arg:
            if str_compare(arg.name, "activation"):
@ -102,16 +90,14 @@ class TestTransformations(test_util.TestCase):
        assert has_activation_arg
        assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0]

-    def test_fuseNNPACKConvReluInplaceRelu(self):
+    def test_transformer_FuseNNPACKConvReluInplaceRelu(self):
        net = core.Net("net")
-        net.Conv(
-            ["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW"
-        )
+        net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW")
        net.Relu(["Y"], ["Y"])
-        addNNPACK(net) # get the NNPACK engine
+        transformer.AddNNPACK(net)  # get the NNPACK engine
        assert str_compare(net.Proto().op[0].engine, "NNPACK")
-        fuseNNPACKConvRelu(net)
-        assert (len(net.Proto().op) == 1)
+        transformer.FuseNNPACKConvRelu(net)
+        assert len(net.Proto().op) == 1
        has_activation_arg = False
        for arg in net.Proto().op[0].arg:
            if str_compare(arg.name, "activation"):
@ -120,19 +106,15 @@ class TestTransformations(test_util.TestCase):
        assert has_activation_arg
        assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0]

-    def test_fuseNNPACKConvReluPingPongNaming(self):
+    def test_transformer_FuseNNPACKConvReluPingPongNaming(self):
        net = core.Net("net")
-        net.Conv(
-            ["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW"
-        )
+        net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW")
        net.Relu(["Y"], ["X"])
-        net.Conv(
-            ["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW"
-        )
-        addNNPACK(net) # get the NNPACK engine
+        net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW")
+        transformer.AddNNPACK(net)  # get the NNPACK engine
        assert str_compare(net.Proto().op[0].engine, "NNPACK")
-        fuseNNPACKConvRelu(net)
-        assert (len(net.Proto().op) == 2)
+        transformer.FuseNNPACKConvRelu(net)
+        assert len(net.Proto().op) == 2
        has_activation_arg = False
        for arg in net.Proto().op[0].arg:
            if str_compare(arg.name, "activation"):
@ -142,20 +124,16 @@ class TestTransformations(test_util.TestCase):
        assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0]
        assert net.Proto().op[1].output[0] != net.Proto().op[1].input[0]

-    def test_fuseNNPACKConvReluFollowedByMultipleInputOp(self):
+    def test_transformer_FuseNNPACKConvReluFollowedByMultipleInputOp(self):
        net = core.Net("net")
-        net.Conv(
-            ["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW"
-        )
+        net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW")
        net.Relu(["Y"], ["Y2"])
-        net.Conv(
-            ["Y2", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW"
-        )
+        net.Conv(["Y2", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW")
        net.Relu(["Y"], ["Y2"])
-        addNNPACK(net) # get the NNPACK engine
+        transformer.AddNNPACK(net)  # get the NNPACK engine
        assert str_compare(net.Proto().op[0].engine, "NNPACK")
-        fuseNNPACKConvRelu(net)
-        assert (len(net.Proto().op) == 2)
+        transformer.FuseNNPACKConvRelu(net)
+        assert len(net.Proto().op) == 2
        has_activation_arg = False
        for arg in net.Proto().op[0].arg:
            if str_compare(arg.name, "activation"):
@ -165,20 +143,16 @@ class TestTransformations(test_util.TestCase):
        assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0]
        assert net.Proto().op[1].output[0] != net.Proto().op[1].input[0]

-    def test_fuseNNPACKConvReluInplaceFollowedByMultipleInputOp(self):
+    def test_transformer_FuseNNPACKConvReluInplaceFollowedByMultipleInputOp(self):
        net = core.Net("net")
-        net.Conv(
-            ["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW"
-        )
+        net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW")
        net.Relu(["Y"], ["Y"])
-        net.Conv(
-            ["Y", "w", "b"], ["Y2"], stride=1, pad=0, kernel=3, order="NCHW"
-        )
+        net.Conv(["Y", "w", "b"], ["Y2"], stride=1, pad=0, kernel=3, order="NCHW")
        net.Relu(["Y2"], ["Y2"])
-        addNNPACK(net) # get the NNPACK engine
+        transformer.AddNNPACK(net)  # get the NNPACK engine
        assert str_compare(net.Proto().op[0].engine, "NNPACK")
-        fuseNNPACKConvRelu(net)
-        assert (len(net.Proto().op) == 2)
+        transformer.FuseNNPACKConvRelu(net)
+        assert len(net.Proto().op) == 2
        has_activation_arg = False
        for arg in net.Proto().op[0].arg:
            if str_compare(arg.name, "activation"):
@ -188,14 +162,12 @@ class TestTransformations(test_util.TestCase):
        assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0]
        assert net.Proto().op[1].output[0] != net.Proto().op[1].input[0]

-    def test_sinkMaxPool(self):
+    def test_transformer_SinkMaxPool(self):
        net = core.Net("net")
-        net.Conv(
-            ["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW"
-        )
+        net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW")
        net.MaxPool(["Y"], ["Y1"], kernel=3)
        net.Relu(["Y1"], ["Y1"])
-        sinkMaxPool(net)
+        transformer.SinkMaxPool(net)
        assert str_compare(net.Proto().op[1].type, "Relu")
        assert str_compare(net.Proto().op[2].type, "MaxPool")

@ -204,9 +176,9 @@ class TestTransformations(test_util.TestCase):
        input_channels=st.integers(1, 10),
        seed=st.integers(0, 65535),
        order=st.sampled_from(["NCHW", "NHWC"]),
-        epsilon=st.floats(min_value=1e-5, max_value=1e-2)
+        epsilon=st.floats(min_value=1e-5, max_value=1e-2),
    )
-    def test_fuseConvBN(self, size, input_channels, seed, order, epsilon):
+    def test_transformer_FuseConvBN(self, size, input_channels, seed, order, epsilon):
        net = core.Net("net")
        c = input_channels
        h = size
@ -214,31 +186,20 @@ class TestTransformations(test_util.TestCase):
        k = 3
        net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=k, order=order)
        net.SpatialBN(
-            ["Y", "scale", "bias", "mean", "var"], ["Y2"],
+            ["Y", "scale", "bias", "mean", "var"],
+            ["Y2"],
            is_test=True,
            order=order,
-            epsilon=epsilon
+            epsilon=epsilon,
        )

        np.random.seed(seed)
        if order == "NCHW":
-            workspace.FeedBlob(
-                "X",
-                np.random.rand(1, c, h, w).astype(np.float32)
-            )
-            workspace.FeedBlob(
-                "w",
-                np.random.rand(c, c, k, k).astype(np.float32)
-            )
+            workspace.FeedBlob("X", np.random.rand(1, c, h, w).astype(np.float32))
+            workspace.FeedBlob("w", np.random.rand(c, c, k, k).astype(np.float32))
        else:
-            workspace.FeedBlob(
-                "X",
-                np.random.rand(1, h, w, c).astype(np.float32)
-            )
-            workspace.FeedBlob(
-                "w",
-                np.random.rand(c, k, k, c).astype(np.float32)
-            )
+            workspace.FeedBlob("X", np.random.rand(1, h, w, c).astype(np.float32))
+            workspace.FeedBlob("w", np.random.rand(c, k, k, c).astype(np.float32))
        workspace.FeedBlob("b", np.random.rand(c).astype(np.float32))
        workspace.FeedBlob("scale", np.random.rand(c).astype(np.float32))
        workspace.FeedBlob("bias", np.random.rand(c).astype(np.float32))
@ -246,11 +207,13 @@ class TestTransformations(test_util.TestCase):
        workspace.FeedBlob("var", np.random.rand(c).astype(np.float32))
        workspace.RunNetOnce(net)
        preTransformOutput = workspace.FetchBlob("Y2")
-        fuseConvBN(net)
+        transformer.FuseConvBN(net)

        # Ensure fusion
-        assert (len(net.Proto().op) == 1)
+        assert len(net.Proto().op) == 1
        workspace.RunNetOnce(net)
        postTransformOutput = workspace.FetchBlob("Y2")
        # Check that there is no numerical difference
-        assert (np.allclose(preTransformOutput, postTransformOutput, rtol=1e-05, atol=1e-08))
+        assert np.allclose(
+            preTransformOutput, postTransformOutput, rtol=1e-05, atol=1e-08
+        )
--- a/caffe2/queue/rebatching_queue.cc
+++ b/caffe2/queue/rebatching_queue.cc
@ -163,7 +163,7 @@ bool RebatchingQueue::enqueueOne(
  auto& tensorVector = splittedInputs.back();
  tensorVector.reserve(inputs.size());
  for (const auto* tensorPtr : inputs) {
-    tensorVector.push_back(*tensorPtr);
+    tensorVector.push_back(tensorPtr->Clone());
  }

  return enqueue(std::move(splittedInputs));
--- a/caffe2/sgd/yellowfin_op.h
+++ b/caffe2/sgd/yellowfin_op.h
@ -111,19 +111,19 @@ class YellowFinOp final : public Operator<Context> {
  bool RunOnDevice() override {
 // Iter live on the CPU

-#define CAFFE2_YF_READ_INPUT(INPUT_NAME, VAR_NAME)  \
-  const auto VAR_NAME##_tensor = Input(INPUT_NAME); \
+#define CAFFE2_YF_READ_INPUT(INPUT_NAME, VAR_NAME)   \
+  const auto& VAR_NAME##_tensor = Input(INPUT_NAME); \
  VAR_NAME##_ = VAR_NAME##_tensor.template data<T>();

-    CAFFE2_YF_READ_INPUT(PARAM, param)
-    CAFFE2_YF_READ_INPUT(MOMENT, moment)
-    CAFFE2_YF_READ_INPUT(LR_AVG, lr_avg)
-    CAFFE2_YF_READ_INPUT(MU_AVG, mu_avg)
-    CAFFE2_YF_READ_INPUT(CURV_WIN, curv_win)
-    CAFFE2_YF_READ_INPUT(G_AVG, g_avg)
-    CAFFE2_YF_READ_INPUT(G2_AVG, g2_avg)
-    CAFFE2_YF_READ_INPUT(SCALARS_MEMORY, scalars_memory)
-    CAFFE2_YF_READ_INPUT(GRAD, grad)
+CAFFE2_YF_READ_INPUT(PARAM, param)
+CAFFE2_YF_READ_INPUT(MOMENT, moment)
+CAFFE2_YF_READ_INPUT(LR_AVG, lr_avg)
+CAFFE2_YF_READ_INPUT(MU_AVG, mu_avg)
+CAFFE2_YF_READ_INPUT(CURV_WIN, curv_win)
+CAFFE2_YF_READ_INPUT(G_AVG, g_avg)
+CAFFE2_YF_READ_INPUT(G2_AVG, g2_avg)
+CAFFE2_YF_READ_INPUT(SCALARS_MEMORY, scalars_memory)
+CAFFE2_YF_READ_INPUT(GRAD, grad)
 #undef CAFFE2_YF_READ_OUTPUT

    CAFFE_ENFORCE(OperatorBase::InputIsType<TensorCPU>(ITER));
--- a/caffe2/release-notes.md
+++ b/caffe2/release-notes.md
--- a/rsync_exclude.txt
+++ b/rsync_exclude.txt
@ -0,0 +1,29 @@
+# To do syncs, check out caffe2 under ~/local, check out the fbsync branch,
+# and then execute
+#   rsync -arv --delete --exclude-from=rsync_exclude.txt ./ ~/local/caffe2/
+# Make sure you do a dry run before actually doing anything.
+
+.git
+caffe/
+caffe2/fb/
+caffe2/experiments/
+third_party/
+PLATFORM
+caffe2/proto/fb_protobuf.sh
+README.facebook
+rsync_exclude.txt
+TARGETS
+.gitmodules
+.ipynb_checkpoints
+*.tmp
+
+# These two files are created by patch commands and are not needed.
+*.orig
+*.rej
+
+# We have these two files under fbcode for convenience.
+caffe2/contrib/nervana/nervana_c_api.cu
+caffe2/contrib/nervana/nervana_c_api.h
+
+# We have decided to delay open-sourcing the mobile engine of conv transpose.
+caffe2/operators/conv_transpose_op_mobile*
--- a/submodules/tbb-rev.txt
+++ b/submodules/tbb-rev.txt
@ -0,0 +1 @@
+Subproject commit 633b01ad27e012e1dc4e392c3230250d1f4967a4
--- a/test/cpp/api/integration.cpp
+++ b/test/cpp/api/integration.cpp
@ -340,10 +340,10 @@ TEST_CASE("integration/mnist", "[cuda]") {
  auto linear2 = model->add(Linear(50, 10), "linear2");

  auto forward = [&](torch::Tensor x) {
-    x = at::max_pool2d(conv1->forward(x), {2, 2}).relu();
+    x = std::get<0>(at::max_pool2d(conv1->forward(x), {2, 2})).clamp_min(0);
    x = conv2->forward(x);
    x = drop2d->forward(x);
-    x = at::max_pool2d(x, {2, 2}).relu();
+    x = std::get<0>(at::max_pool2d(x, {2, 2})).clamp_min(0);

    x = x.view({-1, 320});
    x = linear1->forward(x).clamp_min(0);
@ -377,10 +377,10 @@ TEST_CASE("integration/mnist/batchnorm", "[cuda]") {
  auto linear2 = model->add(Linear(50, 10), "linear2");

  auto forward = [&](torch::Tensor x) {
-    x = at::max_pool2d(conv1->forward(x), {2, 2}).relu();
+    x = std::get<0>(at::max_pool2d(conv1->forward(x), {2, 2})).clamp_min(0);
    x = batchnorm2d->forward(x);
    x = conv2->forward(x);
-    x = at::max_pool2d(x, {2, 2}).relu();
+    x = std::get<0>(at::max_pool2d(x, {2, 2})).clamp_min(0);

    x = x.view({-1, 320});
    x = linear1->forward(x).clamp_min(0);
--- a/test/test_distributed_trap.py
+++ b/test/test_distributed_trap.py
@ -0,0 +1,23 @@
+import os
+import tempfile
+import sys
+import random
+import __test_main__
+
+tmp_dir = tempfile.TemporaryDirectory()
+os.environ["TEMP_DIR"] = tmp_dir.name
+os.mkdir(os.path.join(tmp_dir.name, "barrier"))
+os.mkdir(os.path.join(tmp_dir.name, "test_dir"))
+init_dir_path = os.path.join(tmp_dir.name, "init_dir")
+os.mkdir(init_dir_path)
+init_method = os.environ.get('INIT_METHOD')
+if init_method is not None and init_method == "zeus":
+    os.environ['INIT_METHOD'] = 'zeus://unittest_' + \
+        str(random.randint(1, 1000000000000))
+else:
+    os.environ['INIT_METHOD'] = 'file://' + \
+        os.path.join(init_dir_path, 'shared_init_file')
+
+
+if __name__ == '__main__':
+    __test_main__.main(sys.argv)
--- a/third_party/nccl/CMakeLists.txt
+++ b/third_party/nccl/CMakeLists.txt
@ -7,13 +7,14 @@ ENDIF()

 include("${CMAKE_UTILS_PATH}")
 torch_cuda_get_nvcc_gencode_flag(NVCC_GENCODE)
-string(REPLACE "-gencode;" "-gencode=" NVCC_GENCODE "${NVCC_GENCODE}")
-message(STATUS "Set NVCC_GENCODE for building NCCL: ${NVCC_GENCODE}")
+string (REPLACE ";" " " NVCC_GENCODE "${NVCC_GENCODE}")
+string (REPLACE "-gencode " "-gencode=" NVCC_GENCODE "${NVCC_GENCODE}")
+message(INFO "Set NVCC_GENCODE for building NCCL: ${NVCC_GENCODE}")

 ADD_CUSTOM_COMMAND(
   WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
   OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/lib/libnccl.so
-   COMMAND env CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR} NVCC=${CUDA_NVCC_EXECUTABLE} BUILDDIR=${CMAKE_CURRENT_BINARY_DIR} NVCC_GENCODE="${NVCC_GENCODE}" make -j${NUM_JOBS}
+   COMMAND env CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR} NVCC=${CUDA_NVCC_EXECUTABLE} BUILDDIR=${CMAKE_CURRENT_BINARY_DIR} NVCC_GENCODE="${NVCC_GENCODE}" make -j `getconf _NPROCESSORS_ONLN`
 )

 ADD_CUSTOM_TARGET(nccl ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/lib/libnccl.so)
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@ -915,11 +915,11 @@
 - name: fractional_max_pool2d_forward(Tensor self, IntList kernel_size, IntList output_size, Tensor random_samples)
  self: fractional_max_pool2d_backward(grad, self, kernel_size, output_size, indices)

- name: max_pool2d_with_indices_forward(Tensor self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode)
-  self: max_pool2d_with_indices_backward(grad, self, kernel_size, stride, padding, dilation, ceil_mode, indices)
+- name: max_pool2d_forward(Tensor self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode)
+  self: max_pool2d_backward(grad, self, kernel_size, stride, padding, dilation, ceil_mode, indices)

- name: max_pool3d_with_indices_forward(Tensor self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode)
-  self: max_pool3d_with_indices_backward(grad, self, kernel_size, stride, padding, dilation, ceil_mode, indices)
+- name: max_pool3d_forward(Tensor self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode)
+  self: max_pool3d_backward(grad, self, kernel_size, stride, padding, dilation, ceil_mode, indices)

 - name: max_unpool2d_forward(Tensor self, Tensor indices, IntList output_size)
  self: max_unpool2d_backward(grad, self, indices, output_size)
@ -1041,11 +1041,11 @@
  grad_output: leaky_relu_backward(grad, self, negative_slope)
  self: zeros_like(grad)

- name: max_pool2d_with_indices_backward(Tensor grad_output, Tensor self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, Tensor indices)
+- name: max_pool2d_backward(Tensor grad_output, Tensor self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, Tensor indices)
  grad_output: max_pool_double_backward(grad, indices, 2);
  self: zeros_like(self)

- name: max_pool3d_with_indices_backward(Tensor grad_output, Tensor self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, Tensor indices)
+- name: max_pool3d_backward(Tensor grad_output, Tensor self, IntList kernel_size, IntList stride, IntList padding, IntList dilation, bool ceil_mode, Tensor indices)
  grad_output: max_pool_double_backward(grad, indices, 3);
  self: zeros_like(self)

--- a/tools/autograd/gen_python_functions.py
+++ b/tools/autograd/gen_python_functions.py
@ -25,7 +25,7 @@ SKIP_PYTHON_BINDINGS = [
    'index',
    '_indexCopy_', 'max_values', 'min_values', 'argmax', 'argmin',
    '_cumsum.*', '_cumprod.*', '_sum.*', '_prod.*', '_th_sum.*', '_th_prod.*',
-    'arange.*', 'range.*', '_gesv.*', 'slice', 'max_pool1d', 'max_pool2d', 'max_pool3d'
+    'arange.*', 'range.*', '_gesv.*', 'slice',
 ]

 PY_VARIABLE_METHOD_VARARGS = CodeTemplate("""\
--- a/tools/setup_helpers/generate_code.py
+++ b/tools/setup_helpers/generate_code.py
@ -37,6 +37,7 @@ outputs = [
    'torch/csrc/autograd/generated/python_nn_functions_dispatch.h',
    'torch/csrc/autograd/generated/python_variable_methods.cpp',
    'torch/csrc/autograd/generated/python_variable_methods_dispatch.h',
+    'torch/csrc/autograd/generated/variable_factories.h',
    'torch/csrc/autograd/generated/VariableType.cpp',
    'torch/csrc/autograd/generated/VariableType.h',
    'torch/csrc/jit/generated/aten_dispatch.cpp',
--- a/torch/nn/functional.py
+++ b/torch/nn/functional.py
@ -341,7 +341,7 @@ def max_pool1d(input, kernel_size, stride=None, padding=0, dilation=1,

    See :class:`~torch.nn.MaxPool1d` for details.
    """
-    ret = torch.max_pool1d_with_indices(input, kernel_size, stride, padding, dilation, ceil_mode)
+    ret = torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)
    return ret if return_indices else ret[0]


@ -352,7 +352,7 @@ def max_pool2d(input, kernel_size, stride=None, padding=0, dilation=1,

    See :class:`~torch.nn.MaxPool2d` for details.
    """
-    ret = torch._C._nn.max_pool2d_with_indices(input, kernel_size, stride, padding, dilation, ceil_mode)
+    ret = torch._C._nn.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
    return ret if return_indices else ret[0]


@ -363,7 +363,7 @@ def max_pool3d(input, kernel_size, stride=None, padding=0, dilation=1,

    See :class:`~torch.nn.MaxPool3d` for details.
    """
-    ret = torch._C._nn.max_pool3d_with_indices(input, kernel_size, stride, padding, dilation, ceil_mode)
+    ret = torch._C._nn.max_pool3d(input, kernel_size, stride, padding, dilation, ceil_mode)
    return ret if return_indices else ret[0]


--- a/torch/onnx/symbolic.py
+++ b/torch/onnx/symbolic.py
@ -396,11 +396,11 @@ def softplus(g, self, beta, threshold):
    return g.op('Softplus', self)


-def max_pool1d_with_indices(g, input, kernel_size, stride, padding, dilation, ceil_mode):
+def max_pool1d(g, input, kernel_size, stride, padding, dilation, ceil_mode):
    if ceil_mode:
-        return _unimplemented("max_pool1d_with_indices", "ceil_mode")
+        return _unimplemented("max_pool1d", "ceil_mode")
    if set(_single(dilation)) != {1}:
-        return _unimplemented("max_pool1d_with_indices", "dilation")
+        return _unimplemented("max_pool1d", "dilation")
    if stride is None:
        stride = kernel_size
    r = g.op("MaxPool", input,
@ -410,11 +410,11 @@ def max_pool1d_with_indices(g, input, kernel_size, stride, padding, dilation, ce
    return r, None


-def max_pool2d_with_indices(g, input, kernel_size, stride, padding, dilation, ceil_mode):
+def max_pool2d(g, input, kernel_size, stride, padding, dilation, ceil_mode):
    if ceil_mode:
-        return _unimplemented("max_pool2d_with_indices", "ceil_mode")
+        return _unimplemented("max_pool2d", "ceil_mode")
    if set(_pair(dilation)) != {1}:
-        return _unimplemented("max_pool2d_with_indices", "dilation")
+        return _unimplemented("max_pool2d", "dilation")
    if not stride:
        stride = kernel_size
    r = g.op("MaxPool", input,
@ -424,11 +424,11 @@ def max_pool2d_with_indices(g, input, kernel_size, stride, padding, dilation, ce
    return r, None


-def max_pool3d_with_indices(g, input, kernel_size, stride, padding, dilation, ceil_mode):
+def max_pool3d(g, input, kernel_size, stride, padding, dilation, ceil_mode):
    if ceil_mode:
-        return _unimplemented("max_pool3d_with_indices", "ceil_mode")
+        return _unimplemented("max_pool3d", "ceil_mode")
    if set(_triple(dilation)) != {1}:
-        return _unimplemented("max_pool3d_with_indices", "dilation")
+        return _unimplemented("max_pool3d", "dilation")
    if not stride:
        stride = kernel_size
    r = g.op("MaxPool", input,
				`@ -0,0 +1 @@`
				`Subproject commit 633b01ad27e012e1dc4e392c3230250d1f4967a4`