Enable loading int8 prepacked models in PredictorContainer

Summary: To test the int8 ads models on CPU and on accelerators with the ads replayer, we need to load and run the PREPACKING_INIT_NET_TYPE net stored in the int8 model so that the int8 w_packed blobs are initialized.
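
For context on how the prepacking init net is used: it is run once in the target workspace so that the packed int8 weight blobs (e.g. w_packed) exist before the predict net executes. Below is a minimal standalone sketch of that flow, not the PredictorContainer code path itself; the file names are placeholders.

#include "caffe2/core/workspace.h"
#include "caffe2/proto/caffe2_pb.h"
#include "caffe2/utils/proto_utils.h"

int main() {
  caffe2::NetDef prepack_init_net, predict_net;
  // Placeholder paths; in the real flow these nets come out of the int8
  // model's meta net def, with the prepacking net stored under
  // PREPACKING_INIT_NET_TYPE.
  CAFFE_ENFORCE(
      caffe2::ReadProtoFromFile("prepack_init_net.pb", &prepack_init_net));
  CAFFE_ENFORCE(caffe2::ReadProtoFromFile("predict_net.pb", &predict_net));

  caffe2::Workspace ws;
  // Running the prepacking init net creates the int8 w_packed blobs.
  CAFFE_ENFORCE(ws.RunNetOnce(prepack_init_net));
  // The predict net can now find its packed weights in the workspace.
  CAFFE_ENFORCE(ws.RunNetOnce(predict_net));
  return 0;
}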

Test Plan:
Ads replayer test.

P74811059

Reviewed By: zrphercule

Differential Revision: D16518888

fbshipit-source-id: cee212710ad37d9e491c970b25b2fe484373e5e4
Author: Summer Deng
Date: 2019-09-06 02:51:32 -07:00
Committed by: Facebook Github Bot
Parent: cc4211069e
Commit: d95763b4dc
4 changed files with 41 additions and 9 deletions

@@ -53,7 +53,7 @@ void BoundShapeInferencer::InferOps(
     InferSparseLengthsSum(op);
   } else if (
       op.type() == "FC" || op.type() == "FCTransposed" ||
-      op.type() == "FbFCPacked") {
+      op.type() == "FbFCPacked" || op.type() == "Int8FC") {
     InferFC(op);
   } else if (op.type() == "Concat") {
     InferConcat(op);
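
With the hunk above, Int8FC ops are routed through the same shape-inference path as FC and FbFCPacked. For illustration only, roughly what such an operator looks like in a net; the blob names are made up and this is not taken from the patch.

#include "caffe2/proto/caffe2_pb.h"

caffe2::OperatorDef MakeInt8FCExample() {
  caffe2::OperatorDef op;
  op.set_type("Int8FC");
  op.set_engine("DNNLOWP");
  op.add_input("x");         // activations, op.input(0)
  op.add_input("w_packed");  // quantized (possibly prepacked) weights
  op.add_input("b");         // bias, op.input(2), checked by InferFC
  op.add_output("y");
  return op;
}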
@@ -424,12 +424,13 @@ void BoundShapeInferencer::InferFC(const OperatorDef& op) {
   const ShapeInfo& w_shape_info = w_it->second;
   const auto b_it = shape_info_.find(op.input(2));
   CAFFE_ENFORCE(
-      w_it != shape_info_.end(),
+      b_it != shape_info_.end(),
       "Shape of BIAS input of FC ",
       op.input(2),
       " needs to be presented");
   const ShapeInfo& b_shape_info = b_it->second;
   bool fp16 = (op.type() == "FbFCPacked");
+  bool int8_fc = (op.type() == "Int8FC" || op.engine() == "DNNLOWP");
   auto x_it = shape_info_.find(op.input(0));
   if (x_it == shape_info_.end()) {
     // We don't have a hint at the x input we try to deduce it from weight
@@ -451,13 +452,21 @@ void BoundShapeInferencer::InferFC(const OperatorDef& op) {
     dims.push_back(K);
     current_dim_type_ = ShapeInfo::DimType::BATCH;
     current_max_batch_size_ = spec_.max_batch_size;
+    TensorProto::DataType w_data_type;
+    if (fp16) {
+      w_data_type = TensorProto_DataType_FLOAT;
+    } else if (int8_fc) {
+      w_data_type = TensorProto_DataType_UINT8;
+    } else {
+      w_data_type = w_shape.data_type();
+    }
     // Note: for FbFCPacked, weight is fp16 but actications are in fp32
     CheckAndSetTensorShapeAndType(
         op.input(0),
         ShapeInfo::DimType::BATCH,
         dims,
-        fp16 ? TensorProto_DataType_FLOAT : w_shape.data_type(),
-        false);
+        w_data_type,
+        int8_fc ? true : false);
   } else {
     ShapeInfo& x_shape_info = x_it->second;
     if (x_shape_info.dim_type != ShapeInfo::DimType::BATCH) {
@@ -472,12 +481,20 @@ void BoundShapeInferencer::InferFC(const OperatorDef& op) {
       shape_info_[op.input(0)].shape, w_shape_info.shape, b_shape_info.shape};
   std::vector<TensorShape> output_shapes = InferOutput(op, input_shapes);
   CAFFE_ENFORCE_EQ(output_shapes.size(), 1);
+  TensorProto::DataType output_data_type;
+  if (fp16) {
+    output_data_type = TensorProto_DataType_FLOAT;
+  } else if (int8_fc) {
+    output_data_type = TensorProto_DataType_UINT8;
+  } else {
+    output_data_type = output_shapes[0].data_type();
+  }
   CheckAndSetTensorShapeAndType(
       op.output(0),
       ShapeInfo::DimType::BATCH,
       ConvertToVec(output_shapes[0].dims()),
-      fp16 ? TensorProto_DataType_FLOAT : output_shapes[0].data_type(),
-      false);
+      output_data_type,
+      int8_fc ? true : false);
 }
 
 void BoundShapeInferencer::InferCommonOp(const OperatorDef& op) {
@@ -511,7 +528,8 @@ void BoundShapeInferencer::InferCommonOp(const OperatorDef& op) {
         {"Int8AveragePool", 0},
         {"Int8FC", 1},
         {"Int8Conv", 1},
-        {"Int8SumRelu", 0}};
+        {"Int8SumRelu", 0},
+        {"Int8Relu", 0}};
     CAFFE_ENFORCE(
         type_info_from_input.find(op.type()) != type_info_from_input.end(),
         "Undefined quantized output data type, add it into type_info_from_input");

@@ -125,7 +125,11 @@ void onnxifi(
     if (kv.size() == 2) {
       auto dims = caffe2::split(',', kv.back());
       TensorShape input;
-      input.set_data_type(TensorProto_DataType_FLOAT);
+      if (kv.front().find("int8") != std::string::npos) {
+        input.set_data_type(TensorProto_DataType_UINT8);
+      } else {
+        input.set_data_type(TensorProto_DataType_FLOAT);
+      }
       bool valid = true;
       for (const auto& d : dims) {
         try {
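
This benchmark-side change keys the hinted input type off the blob name: names containing "int8" get UINT8 shape hints, everything else stays FLOAT. A self-contained sketch of that convention follows, assuming the "name:d1,d2,..." hint format implied by the surrounding code; the helper is illustrative, not the actual onnxifi() code.

#include <sstream>
#include <string>
#include <vector>

// Parse a "name:d1,d2,..." shape hint. Names containing "int8" are treated
// as quantized (uint8) inputs, mirroring the check added above.
struct ShapeHint {
  std::string name;
  std::vector<int> dims;
  bool is_uint8 = false;
};

ShapeHint ParseShapeHint(const std::string& spec) {
  ShapeHint hint;
  const auto colon = spec.find(':');
  hint.name = spec.substr(0, colon);
  hint.is_uint8 = hint.name.find("int8") != std::string::npos;
  if (colon == std::string::npos) {
    return hint;  // no dimensions given
  }
  std::stringstream dim_stream(spec.substr(colon + 1));
  std::string d;
  while (std::getline(dim_stream, d, ',')) {
    hint.dims.push_back(std::stoi(d));
  }
  return hint;
}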

@@ -229,7 +229,9 @@ FullyConnectedDNNLowPPackWeightOp::FullyConnectedDNNLowPPackWeightOp(
     : DNNLowPOp<uint8_t, FCFp32Op>(operator_def, ws),
       axis_w_(this->GetSingleArgument<int32_t>("axis_w", 1)),
       quantize_channelwise_(
-          this->GetSingleArgument<bool>("quantize_channelwise", false)) {
+          this->GetSingleArgument<bool>("quantize_channelwise", false)),
+      save_unpacked_weights_(
+          this->GetSingleArgument<bool>("save_unpacked_weights", false)) {
   if (this->debug_def().engine() == "DNNLOWP_ROWWISE") {
     quantize_channelwise_ = true;
   }
@@ -258,6 +260,13 @@ bool FullyConnectedDNNLowPPackWeightOp::RunOnDevice() {
   QuantizeWeight<uint8_t>(
       InputBlob(0), K, N, Y->qparams, W_quantized, qfactory_.get());
+  if (save_unpacked_weights_) {
+    ReinitializeTensor(
+        &Y->original_tensor, filter.sizes(), at::dtype<int8_t>().device(CPU));
+    auto* buffer = Y->original_tensor.template mutable_data<int8_t>();
+    CAFFE_ENFORCE_EQ(Y->original_tensor.numel(), W_quantized.size());
+    memcpy(buffer, W_quantized.data(), W_quantized.size() * sizeof(int8_t));
+  }
 
   if (this->InputIsType<int8::Int8TensorCPU>(0) && quantize_channelwise_) {
     static int log_occurences = 0;
     if (log_occurences < 32) {
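
save_unpacked_weights is read with GetSingleArgument, so it is just an operator argument on the weight-prepacking op. A hedged example of enabling it when building a prepacking init net; the op type Int8FCPackWeight and the blob names are assumptions, not shown in this diff.

#include "caffe2/proto/caffe2_pb.h"

// Build a weight-prepacking operator with save_unpacked_weights enabled so
// the quantized-but-unpacked weights are kept alongside the packed buffer.
caffe2::OperatorDef MakePackOpExample() {
  caffe2::OperatorDef op;
  op.set_type("Int8FCPackWeight");  // assumed DNNLOWP pack-op name
  op.set_engine("DNNLOWP");
  op.add_input("w");                // FILTER
  op.add_input("b");                // BIAS
  op.add_output("w_packed");
  auto* arg = op.add_arg();
  arg->set_name("save_unpacked_weights");
  arg->set_i(1);                    // read via GetSingleArgument<bool>(...)
  return op;
}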

@@ -24,6 +24,7 @@ class FullyConnectedDNNLowPPackWeightOp final
   int axis_w_;
   bool quantize_channelwise_;
   int nbits_in_non_outlier_; // only for DNNLOWP_ACC16
+  bool save_unpacked_weights_;
 
   INPUT_TAGS(FILTER, BIAS);
 };