Add bitpack order template argument to packbits_tensor (#337)

Tombana · web-flow · commit 4b7499413cf4 · 2020-04-17T15:42:14.000+02:00
diff --git a/larq_compute_engine/core/packbits_utils.h b/larq_compute_engine/core/packbits_utils.h
@@ -24,7 +24,7 @@ int GetPackedTensorSize(const RuntimeShape& shape) {
 
 // Convenience function for bitpacking a tensor along its last dimension
 // and updating the tensor shape
-template <class T, class TBitpacked>
+template <BitpackOrder bitpack_order, class T, class TBitpacked>
 inline void packbits_tensor(const RuntimeShape& in_shape, const T* in_data,
                             const std::int32_t zero_point,
                             RuntimeShape& out_shape, TBitpacked* out_data) {
@@ -35,8 +35,8 @@ inline void packbits_tensor(const RuntimeShape& in_shape, const T* in_data,
 
   {
     gemmlowp::ScopedProfilingLabel label("Packbits");
-    ce::core::packbits_matrix<ce::core::BitpackOrder::Optimized>(
-        in_data, rows, cols, out_data, zero_point);
+    ce::core::packbits_matrix<bitpack_order>(in_data, rows, cols, out_data,
+                                             zero_point);
   }
 
   out_shape.ReplaceWith(dims, in_shape.DimsData());
diff --git a/larq_compute_engine/tflite/kernels/bconv2d.cc b/larq_compute_engine/tflite/kernels/bconv2d.cc
@@ -775,9 +775,9 @@ void EvalRef(TfLiteContext* context, TfLiteNode* node,
   } else {
     TfLiteTensor* packed_input =
         GetTemporary(context, node, params->packed_input_index);
-    ce::core::packbits_tensor(input_shape, input_data, input->params.zero_point,
-                              packed_input_shape,
-                              GetTensorData<TBitpacked>(packed_input));
+    ce::core::packbits_tensor<ce::core::BitpackOrder::Canonical>(
+        input_shape, input_data, input->params.zero_point, packed_input_shape,
+        GetTensorData<TBitpacked>(packed_input));
     packed_input_data = GetTensorData<TBitpacked>(packed_input);
   }
 
diff --git a/larq_compute_engine/tflite/kernels/bconv2d_impl.h b/larq_compute_engine/tflite/kernels/bconv2d_impl.h
@@ -145,8 +145,9 @@ inline void BConv2D(
     } else {
       // The input tensor has this shape which we bitpack along the channels
       // dimension [batch, input height, input width, channels].
-      ce::core::packbits_tensor(input_shape, input_data, params.input_offset,
-                                packed_input_shape, packed_input_data);
+      ce::core::packbits_tensor<ce::core::BitpackOrder::Optimized>(
+          input_shape, input_data, params.input_offset, packed_input_shape,
+          packed_input_data);
       im2col_input_data = packed_input_data;
     }
     im2col<TBitpacked>(params, packed_input_shape, im2col_input_data,
@@ -166,8 +167,9 @@ inline void BConv2D(
     // The RHS tensor has this shape which we bitpack along the last dimension
     //  [batch, output_height, output_width, k * bitwidth]
     RuntimeShape packed_input_shape;
-    ce::core::packbits_tensor(result_shape, result_data, params.input_offset,
-                              packed_input_shape, packed_input_data);
+    ce::core::packbits_tensor<ce::core::BitpackOrder::Optimized>(
+        result_shape, result_data, params.input_offset, packed_input_shape,
+        packed_input_data);
     rhs_data = packed_input_data;
 
     k = packed_input_shape.Dims(3);
diff --git a/larq_compute_engine/tflite/tests/bconv2d_test.cc b/larq_compute_engine/tflite/tests/bconv2d_test.cc
@@ -341,8 +341,9 @@ void set_lce_op_input(const RuntimeShape& input_shape,
   std::vector<std::int32_t> input_data_bp(
       core::GetPackedTensorSize<std::int32_t>(input_shape));
   RuntimeShape output_shape;
-  core::packbits_tensor(input_shape, input_data.data(), zero_point,
-                        output_shape, input_data_bp.data());
+  core::packbits_tensor<ce::core::BitpackOrder::Canonical>(
+      input_shape, input_data.data(), zero_point, output_shape,
+      input_data_bp.data());
   m_lce.SetInput(input_data_bp);
 }
 
@@ -358,8 +359,9 @@ void test_lce_op_output(const std::vector<std::int32_t>& lce_output_data,
   std::vector<std::int32_t> builtin_output_data_bp(
       core::GetPackedTensorSize<std::int32_t>(out_shape));
   RuntimeShape packed_shape;
-  core::packbits_tensor(out_shape, builtin_output_data.data(), zero_point,
-                        packed_shape, builtin_output_data_bp.data());
+  core::packbits_tensor<ce::core::BitpackOrder::Canonical>(
+      out_shape, builtin_output_data.data(), zero_point, packed_shape,
+      builtin_output_data_bp.data());
 
   // We need the outputs here to be bit-exact, so don't allow for floating
   // point imprecision.