Skip to content

Commit 43e197b

Browse files
authored
Add quantized INT8 detection-postprocess op pass to the converter (#777)
1 parent 3fc2658 commit 43e197b

File tree

6 files changed

+245
-0
lines changed

6 files changed

+245
-0
lines changed

larq_compute_engine/mlir/BUILD

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,22 @@ cc_library(
394394
alwayslink = 1,
395395
)
396396

397+
# MLIR pass library: rewrites the TFLite detection-postprocess custom op so it
# runs on quantized INT8 inputs (see transforms/detection_postprocess.cc).
cc_library(
    name = "detection_postprocess_transform",
    srcs = [
        "transforms/detection_postprocess.cc",
    ],
    hdrs = [
        "transforms/passes.h",
    ],
    deps = [
        "//larq_compute_engine/mlir:larq_compute_engine",
        "@llvm-project//mlir:FuncDialect",
        "@org_tensorflow//tensorflow/compiler/mlir/lite:tensorflow_lite",
    ],
    # Keep the pass registration (static initializer) even if no symbol is
    # referenced directly.
    alwayslink = 1,
)
412+
397413
cc_library(
398414
name = "fuse_padding",
399415
srcs = [
@@ -433,6 +449,7 @@ cc_library(
433449
"tf_tfl_passes.h",
434450
],
435451
deps = [
452+
":detection_postprocess_transform",
436453
":fuse_padding",
437454
":larq_compute_engine_bitpack_weights",
438455
":larq_compute_engine_legalize_tflite",

larq_compute_engine/mlir/tests/BUILD

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,13 @@ lce_lit_test_suite(
1414
],
1515
)
1616

17+
# Convenience alias so the whole lit suite can be run as the ":all" target.
test_suite(
    name = "all",
    tests = [
        ":lit",
    ],
)
23+
1724
cc_test(
1825
name = "lce_ops_options_test",
1926
srcs = ["lce_ops_options_test.cc"],
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
// RUN: lce-tf-opt %s -detection-postprocess-int -verify-diagnostics | FileCheck %s

// Checks that the detection-postprocess-int pass absorbs the three
// `tfl.dequantize` ops into the `TFLite_Detection_PostProcess` custom op (so
// the op consumes the int8 tensors directly), and that only the float
// `scores` output (#2) gets a new `tfl.dequantize` inserted after it.

// CHECK-LABEL: detection_postprocess_int
func.func @detection_postprocess_int(%arg0: tensor<1x10x4x!quant.uniform<i8:f32, 2.343750e-02>>, %arg1: tensor<1x10x1x!quant.uniform<i8:f32, 2.343750e-02>>, %arg2: tensor<10x4x!quant.uniform<i8:f32, 2.343750e-02>>) -> (tensor<1x20x4xi32>, tensor<1x20xi32>, tensor<1x20xf32>, tensor<1xi32>) {
  %0 = "tfl.dequantize"(%arg0) : (tensor<1x10x4x!quant.uniform<i8:f32, 2.343750e-02>>) -> tensor<1x10x4xf32>
  %1 = "tfl.dequantize"(%arg1) : (tensor<1x10x1x!quant.uniform<i8:f32, 2.343750e-02>>) -> tensor<1x10x1xf32>
  %2 = "tfl.dequantize"(%arg2) : (tensor<10x4x!quant.uniform<i8:f32, 2.343750e-02>>) -> tensor<10x4xf32>
  %3:4 = "tfl.custom"(%0, %1, %2) {custom_code = "TFLite_Detection_PostProcess", custom_option = #tfl<const_bytes : "0x6D61785F646574656374696F6E73006D61785F636C61737365735F7065725F646574656374696F6E006E756D5F636C6173736573006E6D735F73636F72655F7468726573686F6C64006E6D735F696F755F7468726573686F6C6400795F7363616C6500785F7363616C6500685F7363616C6500775F7363616C65007573655F726567756C61725F6E6D73000A217E8E465B681720313A00000C000000010000000A0000000000803F01000000140000000000003F9A9959BF01000000010000000000803F0000803F0000803F0E06060E0E06060E0E0E322601">} : (tensor<1x10x4xf32>, tensor<1x10x1xf32>, tensor<10x4xf32>) -> (tensor<1x20x4xi32>, tensor<1x20xi32>, tensor<1x20xf32>, tensor<1xi32>)
  return %3#0, %3#1, %3#2, %3#3 : tensor<1x20x4xi32>, tensor<1x20xi32>, tensor<1x20xf32>, tensor<1xi32> // boxes, classes, scores, num_detections

// CHECK: %3:4 = "tfl.custom"(%arg0, %arg1, %arg2) {custom_code = "TFLite_Detection_PostProcess", custom_option = #tfl<const_bytes : "0x6D61785F646574656374696F6E73006D61785F636C61737365735F7065725F646574656374696F6E006E756D5F636C6173736573006E6D735F73636F72655F7468726573686F6C64006E6D735F696F755F7468726573686F6C6400795F7363616C6500785F7363616C6500685F7363616C6500775F7363616C65007573655F726567756C61725F6E6D73000A217E8E465B681720313A00000C000000010000000A0000000000803F01000000140000000000003F9A9959BF01000000010000000000803F0000803F0000803F0E06060E0E06060E0E0E322601">} : (tensor<1x10x4x!quant.uniform<i8:f32, 2.343750e-02>>, tensor<1x10x1x!quant.uniform<i8:f32, 2.343750e-02>>, tensor<10x4x!quant.uniform<i8:f32, 2.343750e-02>>) -> (tensor<1x20x4xi32>, tensor<1x20xi32>, tensor<1x20x!quant.uniform<i8:f32, 2.343750e-02>>, tensor<1xi32>)
// CHECK-NEXT: %4 = "tfl.dequantize"(%3#2) : (tensor<1x20x!quant.uniform<i8:f32, 2.343750e-02>>) -> tensor<1x20xf32>
// CHECK-NEXT: return %3#0, %3#1, %4, %3#3 : tensor<1x20x4xi32>, tensor<1x20xi32>, tensor<1x20xf32>, tensor<1xi32>
}

larq_compute_engine/mlir/tf_tfl_passes.cc

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,16 @@ const char kTFLiteDataLayout[] = "NHWC";
2525
namespace {
2626
void AddQuantizationPasses(const mlir::quant::QuantizationSpecs& quant_specs,
2727
mlir::OpPassManager& pass_manager) {
28+
// PrepareQuantizePass adds Quantize->Dequantize pairs at *every* float tensor
29+
// even if it did not have a fake quantization node, or if that fake
30+
// quantization node was folded away (which would happen for weights).
2831
pass_manager.addNestedPass<mlir::func::FuncOp>(
2932
mlir::TFL::CreatePrepareQuantizePass(quant_specs));
33+
34+
// The LCEQuantizePass is similar to 'step 1' of the TFL QuantizePass below.
3035
pass_manager.addNestedPass<mlir::func::FuncOp>(
3136
mlir::TFL::CreateLCEQuantizePass());
37+
3238
if (quant_specs.default_ranges.first.hasValue() ||
3339
quant_specs.default_ranges.second.hasValue()) {
3440
pass_manager.addNestedPass<mlir::func::FuncOp>(
@@ -39,6 +45,18 @@ void AddQuantizationPasses(const mlir::quant::QuantizationSpecs& quant_specs,
3945
pass_manager.addNestedPass<mlir::func::FuncOp>(
4046
mlir::TFL::CreateLCEQuantizePass());
4147
}
48+
49+
// This absorbs Dequantize ops into the postprocessing op when possible,
50+
// similar to 'step 1' of the TFL QuantizePass below.
51+
pass_manager.addNestedPass<mlir::func::FuncOp>(
52+
mlir::TFL::QuantizeDetectionPostProcessPass());
53+
54+
// QuantizePass does two things:
55+
// 1. For TFLite ops with quantize traits, the Dequantize is absorbed
56+
// into the input of the op, and in certain cases per-channel quantization
57+
// is applied.
58+
// 2. Afterwards, any remaining Quantize->Dequantize pairs with constant input
59+
// are *removed*.
4260
pass_manager.addNestedPass<mlir::func::FuncOp>(
4361
mlir::TFL::CreateQuantizePass());
4462
bool emit_quant_adaptor_ops =
Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
#include "larq_compute_engine/mlir/ir/lce_ops.h"
2+
#include "mlir/Dialect/Func/IR/FuncOps.h"
3+
#include "mlir/IR/PatternMatch.h"
4+
#include "mlir/IR/TypeUtilities.h"
5+
#include "mlir/Pass/Pass.h"
6+
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
7+
#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"
8+
9+
namespace mlir {
10+
namespace TFL {
11+
12+
// Function pass that rewrites a float `TFLite_Detection_PostProcess` custom
// op so it consumes quantized INT8 inputs directly (the actual rewrite is the
// RemoveDequantizeBeforePostProcess pattern in this file).
struct DetectionPostProcess
    : public PassWrapper<DetectionPostProcess,
                         OperationPass<mlir::func::FuncOp>> {
  // Command-line flag used to select this pass, e.g. in lce-tf-opt.
  llvm::StringRef getArgument() const final {
    return "detection-postprocess-int";
  }
  llvm::StringRef getDescription() const final {
    return "Make detection postprocessing op run with int8 input";
  }
  void runOnOperation() override;
};
23+
24+
// Absorbs the three `tfl.dequantize` ops in front of a float
// `TFLite_Detection_PostProcess` custom op so the op consumes the quantized
// INT8 tensors directly. The float `scores` output (#2) becomes an
// int8-quantized output followed by a single `tfl.dequantize`, so the types
// seen by the rest of the graph are unchanged.
struct RemoveDequantizeBeforePostProcess : public OpRewritePattern<CustomOp> {
  using OpRewritePattern<CustomOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(CustomOp detection_op,
                                PatternRewriter& rewriter) const override {
    // ----------------- matching part -----------------

    // Match the custom op code to 'TFLite_Detection_PostProcess'
    auto custom_code = detection_op.custom_code().str();
    if (custom_code != "TFLite_Detection_PostProcess") {
      return rewriter.notifyMatchFailure(detection_op, [&](Diagnostic& diag) {
        diag << "op 'tfl.custom' attribute 'custom_code' failed to satisfy "
                "constraint: constant attribute TFLite_Detection_PostProcess";
      });
    }

    // Check the number of inputs and outputs of the detection op
    auto original_detection_inputs = detection_op.input();
    if (original_detection_inputs.size() != 3) {
      return rewriter.notifyMatchFailure(detection_op, [&](Diagnostic& diag) {
        diag << "expected 3 inputs for the detection op";
      });
    }
    auto original_detection_outputs = detection_op.output();
    if (original_detection_outputs.size() != 4) {
      return rewriter.notifyMatchFailure(detection_op, [&](Diagnostic& diag) {
        diag << "expected 4 outputs for the original detection op";
      });
    }

    // Match that dequantization happens just before the detection op, for
    // each of the three inputs: boxes, scores, anchors.
    const char* input_names[] = {"boxes", "scores", "anchors"};
    SmallVector<DequantizeOp, 3> input_dequantize_ops;
    for (auto i = 0; i < 3; ++i) {
      auto dequantize_op = llvm::dyn_cast_or_null<DequantizeOp>(
          original_detection_inputs[i].getDefiningOp());
      if (!dequantize_op) {
        return rewriter.notifyMatchFailure(detection_op, [&](Diagnostic& diag) {
          diag << "expected dequantization before the op for the '"
               << input_names[i] << "' input";
        });
      }
      input_dequantize_ops.push_back(dequantize_op);
    }

    // Verify the output types of the current detection op:
    // Output type #0: [int32] detection boxes (scaled by 2048)
    // Output type #1: [int32] detection class IDs
    // Output type #2: [float32] detection scores
    // Output type #3: [int32] number of detections
    auto output_data_types = SmallVector<Type, 4>{
        rewriter.getIntegerType(32),
        rewriter.getIntegerType(32),
        rewriter.getF32Type(),
        rewriter.getIntegerType(32),
    };
    for (auto i = 0; i < 4; ++i) {
      auto original_type =
          original_detection_outputs[i].getType().cast<ShapedType>();
      auto original_data_type = original_type.getElementType();
      if (output_data_types[i] != original_data_type) {
        return rewriter.notifyMatchFailure(detection_op, [&](Diagnostic& diag) {
          diag << "unexpected output type of the op";
        });
      }
    }

    // The new op inherits the quantization parameters of the quantized
    // 'scores' input, so that input must be per-tensor uniform-quantized.
    // `dyn_cast` (rather than `cast`) makes the pattern fail gracefully on
    // e.g. per-axis quantized input instead of asserting.
    auto scores_input = input_dequantize_ops[1].input();
    auto scores_type = scores_input.getType()
                           .cast<ShapedType>()
                           .getElementType()
                           .dyn_cast<quant::UniformQuantizedType>();
    if (!scores_type) {
      return rewriter.notifyMatchFailure(detection_op, [&](Diagnostic& diag) {
        diag << "expected per-tensor uniform quantization for the 'scores' "
                "input";
      });
    }

    // ----------------- re-write part -----------------

    // Use the original (still-quantized) inputs of the dequantize ops as the
    // inputs of the new detection op.
    auto new_detection_inputs = SmallVector<Value, 3>{
        input_dequantize_ops[0].input(),  // boxes
        scores_input,                     // scores
        input_dequantize_ops[2].input(),  // anchors
    };

    // Set the 4 output types [boxes, classes, scores, num_detections]:
    // Output type #0: [int32] detection boxes (scaled by 2048)
    // Output type #1: [int32] detection class IDs
    // Output type #2: [int8 quantized] detection scores
    // Output type #3: [int32] number of detections
    // All as before, except for output #2 (float -> int8 quantized), which
    // reuses the scale and zero-point of the 'scores' input.
    output_data_types[2] = quant::UniformQuantizedType::get(
        true, rewriter.getIntegerType(8), rewriter.getF32Type(),
        scores_type.getScale(), scores_type.getZeroPoint(), -128, 127);

    // Set for all the outputs: data-type (as set above) and shape (as before)
    auto new_op_output_types = SmallVector<Type, 4>{};
    for (auto i = 0; i < 4; ++i) {
      auto shape =
          original_detection_outputs[i].getType().cast<ShapedType>().getShape();
      new_op_output_types.push_back(
          RankedTensorType::get(shape, output_data_types[i]));
    }

    // Add a new detection op (with int8 input and int8/int32 output)
    auto new_detection_op = rewriter.create<CustomOp>(
        detection_op->getLoc(), new_op_output_types, new_detection_inputs,
        std::string{"TFLite_Detection_PostProcess"},
        detection_op.custom_option());

    // Dequantize the new int8 'scores' output (#2) back to float, restoring
    // the original float result type for downstream users.
    auto new_dequantize_op = rewriter.create<DequantizeOp>(
        detection_op->getLoc(), original_detection_outputs[2].getType(),
        new_detection_op.output()[2]);

    // Final re-write: boxes, classes, dequantized scores, num_detections
    auto new_outputs = SmallVector<Value, 4>{
        new_detection_op.output()[0], new_detection_op.output()[1],
        new_dequantize_op.output(), new_detection_op.output()[3]};
    rewriter.replaceOp(detection_op, new_outputs);
    return success();
  }
};
167+
168+
void DetectionPostProcess::runOnOperation() {
169+
auto* ctx = &getContext();
170+
RewritePatternSet patterns(ctx);
171+
auto func = getOperation();
172+
173+
patterns.add<RemoveDequantizeBeforePostProcess>(ctx);
174+
(void)applyPatternsAndFoldGreedily(func, std::move(patterns));
175+
}
176+
177+
// Creates an instance of the TensorFlow dialect DetectionPostProcess pass.
178+
std::unique_ptr<OperationPass<func::FuncOp>>
179+
QuantizeDetectionPostProcessPass() {
180+
return std::make_unique<DetectionPostProcess>();
181+
}
182+
183+
static PassRegistration<DetectionPostProcess> pass;
184+
185+
} // namespace TFL
186+
} // namespace mlir

larq_compute_engine/mlir/transforms/passes.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ std::unique_ptr<OperationPass<func::FuncOp>> CreateLCEQuantizePass();
2929
// Creates an instance of LegalizeLCE pass.
3030
std::unique_ptr<OperationPass<func::FuncOp>> CreateLegalizeLCEPass();
3131

32+
// Creates an instance of the TensorFlow dialect DetectionPostProcess pass.
33+
std::unique_ptr<OperationPass<func::FuncOp>> QuantizeDetectionPostProcessPass();
34+
3235
// Creates an instance of the FusePadding pass.
3336
std::unique_ptr<OperationPass<func::FuncOp>> CreateFusePaddingPass();
3437

0 commit comments

Comments
 (0)