larq
diff --git a/‎larq_compute_engine/core/packbits.h
Lines changed: 42 additions & 84 deletions b/‎larq_compute_engine/core/packbits.h
Lines changed: 42 additions & 84 deletions
diff --git a/‎larq_compute_engine/core/packbits_utils.h
Lines changed: 14 additions & 7 deletions b/‎larq_compute_engine/core/packbits_utils.h
Lines changed: 14 additions & 7 deletions
diff --git a/‎larq_compute_engine/core/tests/packbits_tests.cc
Lines changed: 15 additions & 110 deletions b/‎larq_compute_engine/core/tests/packbits_tests.cc
Lines changed: 15 additions & 110 deletions
diff --git a/‎larq_compute_engine/mlir/transforms/optimize.cc
Lines changed: 2 additions & 4 deletions b/‎larq_compute_engine/mlir/transforms/optimize.cc
Lines changed: 2 additions & 4 deletions
@@ -5,7 +5,6 @@
 #include <cstdint>
 #include <cstring>
 #include <limits>
-#include <vector>
 
 #include "larq_compute_engine/core/types.h"
 #ifdef __aarch64__
@@ -15,11 +14,30 @@
 namespace compute_engine {
 namespace core {
 
-#define GET_POINTER_TO_ROW(array_pointer, lda, row_index) \
-  ((array_pointer) + ((row_index) * (lda)))
+// Utility functions for sizing bitpacked buffers (results count packed words of `bitwidth` bits)
+constexpr int GetPackedSize(int unpacked_elements, int bitwidth) {  // words needed to hold `unpacked_elements` one-bit values
+  return (unpacked_elements + bitwidth - 1) / bitwidth;  // rounds up for non-negative inputs, so a partial last word still counts
+}
+
+constexpr int GetPackedMatrixSize(int rows, int cols, int bitwidth) {  // total packed words for a rows x cols matrix
+  return rows * GetPackedSize(cols, bitwidth);  // the per-row word count is rounded up before multiplying by rows
+}
+
+template <typename TBitpacked>
+constexpr int GetPackedSize(int unpacked_elements) {  // packed word count with the bitwidth taken from TBitpacked
+  return GetPackedSize(
+      unpacked_elements,
+      std::numeric_limits<
+          typename std::make_unsigned<TBitpacked>::type>::digits);  // `digits` of the unsigned counterpart is used as the bitwidth
+}
 
-#define GET_ELEM_INDEX(row_index, col_index, lda) \
-  ((row_index) * (lda) + (col_index))
+template <typename TBitpacked>
+constexpr int GetPackedMatrixSize(int rows, int cols) {  // packed word count for a rows x cols matrix, bitwidth taken from TBitpacked
+  return GetPackedMatrixSize(
+      rows, cols,
+      std::numeric_limits<
+          typename std::make_unsigned<TBitpacked>::type>::digits);  // `digits` of the unsigned counterpart is used as the bitwidth
+}
 
 template <class TIn, class TOut>
 inline void pack_canonical(const TIn* fptr, TOut* buf) {
@@ -368,89 +386,29 @@ inline void packbits_array(const TIn* input_array, const std::size_t n,
   }
 }
 
-// input/output matrices are stored in row-major mode
-// bitpacking_axis argument specifies the dimension in matrix where
-// the bitpacking operation is performed. For example for a RowWise
-// bitpacking operation compresses an MxN matrix to a Mx(N/bitwidth)
-// matrix.
-template <BitpackOrder bitpack_order, class TIn, class TOutContainer>
-inline void packbits_matrix(const TIn* input_data,
-                            const std::size_t input_num_rows,
-                            const std::size_t input_num_cols,
-                            TOutContainer& output, std::size_t& output_num_rows,
-                            std::size_t& output_num_cols,
-                            std::size_t& output_bitpadding,
-                            const Axis bitpacking_axis,
+// Bitpacks each row of a row-major matrix into the caller-provided `output` pointer (nothing is allocated or resized here)
+template <BitpackOrder bitpack_order, class TIn, class TOut_>
+inline void packbits_matrix(const TIn* input, const std::size_t input_num_rows,
+                            const std::size_t input_num_cols, TOut_* output,
                             const std::int32_t zero_point = 0) {
   // Force the types to be unsigned so that the function can be called on signed
   // types as well
-  using TOut =
-      typename std::make_unsigned<typename TOutContainer::value_type>::type;
-  const std::size_t bitwidth = std::numeric_limits<TOut>::digits;
-
-  if (bitpacking_axis == Axis::RowWise) {
-    // calculate size of bitpacked matrix and allocate its memory
-    output_num_rows = input_num_rows;
-    output_num_cols = (input_num_cols + bitwidth - 1) / bitwidth;
-
-    const auto output_size = output_num_cols * output_num_rows;
-    output.resize(output_size);
-
-    const auto input_row_size = input_num_cols;
-    const auto output_row_size = output_num_cols;
-
-    const auto num_extra_elems = input_num_cols % bitwidth;
-    output_bitpadding = num_extra_elems == 0 ? 0 : bitwidth - num_extra_elems;
-
-    // iterate through each row of the input matrix and bitpack the row into
-    // the corresponding memory location of the output matrix
-    TOut* output_data = reinterpret_cast<TOut*>(output.data());
-    for (size_t row_index = 0; row_index < input_num_rows; ++row_index)
-      packbits_array<bitpack_order>(
-          GET_POINTER_TO_ROW(input_data, input_row_size, row_index),
-          input_row_size,
-          GET_POINTER_TO_ROW(output_data, output_row_size, row_index),
-          zero_point);
-    return;
-  }
-
-  if (bitpacking_axis == Axis::ColWise) {
-    // calculate size of bitpacked matrix and allocate its memory
-    output_num_rows = (input_num_rows + bitwidth - 1) / bitwidth;
-    output_num_cols = input_num_cols;
-    const auto output_size = output_num_cols * output_num_rows;
-    output.resize(output_size);
-
-    const auto input_row_size = input_num_cols;
-    const auto output_row_size = output_num_cols;
-
-    const auto num_extra_elems = input_num_rows % bitwidth;
-    output_bitpadding = num_extra_elems == 0 ? 0 : bitwidth - num_extra_elems;
-
-    // allocate temporary buffers
-    std::vector<TIn> input_buffer(input_num_rows);
-    std::vector<TOut> output_buffer(output_num_rows);
-
-    TOut* output_data = reinterpret_cast<TOut*>(output.data());
-
-    // iterate through the columns
-    for (size_t col_index = 0; col_index < input_num_cols; ++col_index) {
-      // store the values of the current column in a buffer
-      for (size_t row_index = 0; row_index < input_num_rows; ++row_index)
-        input_buffer[row_index] =
-            input_data[GET_ELEM_INDEX(row_index, col_index, input_row_size)];
-
-      // bitpack the buffer and store it in the the output matrix
-      packbits_array<bitpack_order>(input_buffer.data(), input_buffer.size(),
-                                    output_buffer.data(), zero_point);
-
-      // store the bitpacked values of the current column in the output matrix
-      for (size_t row_index = 0; row_index < output_num_rows; ++row_index)
-        output_data[GET_ELEM_INDEX(row_index, col_index, output_row_size)] =
-            output_buffer[row_index];
-    }
-    return;
+  using TOut = typename std::make_unsigned<TOut_>::type;
+
+  // Calculate the size of the bitpacked rows (in TOut words, rounded up)
+  const std::size_t output_num_cols = GetPackedSize<TOut>(input_num_cols);
+
+  // Iterate through each row of the input matrix and bitpack the row into the
+  // corresponding memory location of the output matrix
+  const TIn* input_ptr = input;
+  TOut* output_ptr = reinterpret_cast<TOut*>(output);
+  for (size_t row_index = 0; row_index < input_num_rows; ++row_index) {
+    packbits_array<bitpack_order>(input_ptr, input_num_cols, output_ptr,
+                                  zero_point);
+    input_ptr += input_num_cols;  // next unpacked row
+    output_ptr += output_num_cols;  // next packed row; presumably `output` must hold input_num_rows * output_num_cols words -- confirm at call sites
   }
+  return;  // NOTE(review): redundant as the last statement of a void function
 }
 
 template <typename TBitpacked, typename TUnpacked>
 
@@ -11,29 +11,36 @@ namespace compute_engine {
 namespace ce = compute_engine;
 namespace core {
 
+template <typename TBitpacked>
+int GetPackedTensorSize(const RuntimeShape& shape) {  // number of TBitpacked words needed to bitpack `shape` along its last dimension
+  constexpr auto bitwidth = std::numeric_limits<
+      typename std::make_unsigned<TBitpacked>::type>::digits;  // NOTE(review): ce::core::GetPackedMatrixSize<TBitpacked>(rows, cols) derives this same bitwidth
+  const int dims = shape.DimensionsCount();
+  // Pack the tensor along the last dimension
+  const int rows = FlatSizeSkipDim(shape, dims - 1);  // presumably the product of every dimension except the last -- TODO confirm against FlatSizeSkipDim
+  const int cols = shape.Dims(dims - 1);
+  return ce::core::GetPackedMatrixSize(rows, cols, bitwidth);
+}
+
 // Convenience function for bitpacking a tensor along its last dimension
 // and updating the tensor shape
 template <class T, class TBitpacked>
 inline void packbits_tensor(const RuntimeShape& in_shape, const T* in_data,
                             const std::int32_t zero_point,
-                            RuntimeShape& out_shape,
-                            std::vector<TBitpacked>& out_data) {
+                            RuntimeShape& out_shape, TBitpacked* out_data) {  // out_data is written through a raw pointer; presumably sized by the caller via GetPackedTensorSize<TBitpacked>(in_shape) -- confirm
   const int dims = in_shape.DimensionsCount();
   // Pack the tensor along the last dimension
   const int rows = FlatSizeSkipDim(in_shape, dims - 1);
   const int cols = in_shape.Dims(dims - 1);
 
-  std::size_t rows_bp = 0, cols_bp = 0;
-  std::size_t bitpadding = 0;
   {
     gemmlowp::ScopedProfilingLabel label("Packbits");
     ce::core::packbits_matrix<ce::core::BitpackOrder::Optimized>(
-        in_data, rows, cols, out_data, rows_bp, cols_bp, bitpadding,
-        ce::core::Axis::RowWise, zero_point);
+        in_data, rows, cols, out_data, zero_point);  // the tensor is treated as a rows x cols matrix and packed row by row
   }
 
   out_shape.ReplaceWith(dims, in_shape.DimsData());
-  out_shape.SetDim(dims - 1, cols_bp);
+  out_shape.SetDim(dims - 1, GetPackedSize<TBitpacked>(cols));  // only the last dimension changes: it becomes the packed word count
 }
 
 // Convenience function for going from a shape to the packed shape
 
@@ -16,9 +16,7 @@ namespace ce = compute_engine;
 // operation itself while the uniform tests are used to verify the bitpacking
 // with different bitwidths
 template <class TIn>
-void test_bitpacking_nonuniform_input_rowwise() {
-  const auto bitpacking_axis = ce::core::Axis::RowWise;
-
+void test_bitpacking_nonuniform_input() {
   // input matrix (row-major memory layout)
   const std::size_t num_rows = 2, num_cols = 8;
   std::array<TIn, 16> input{1, 1,  -1, 1,  -1, -1, -1, 1,
@@ -32,164 +30,71 @@ void test_bitpacking_nonuniform_input_rowwise() {
   else
     expected = {0b01110100, 0b00111010};
 
-  std::vector<std::uint8_t> output;
-  std::size_t num_rows_bp = 0, num_cols_bp = 0, bitpadding = 0;
-  ce::core::packbits_matrix<ce::core::BitpackOrder::Optimized>(
-      input.data(), num_rows, num_cols, output, num_rows_bp, num_cols_bp,
-      bitpadding, bitpacking_axis);
-
-  EXPECT_EQ(num_rows_bp, expected_num_rows);
-  EXPECT_EQ(num_cols_bp, expected_num_cols);
-  EXPECT_EQ(output.size(), num_rows_bp * num_cols_bp);
-  EXPECT_THAT(output, ::testing::ElementsAreArray(expected));
-}
-
-template <class TIn>
-void test_bitpacking_nonuniform_input_colwise() {
-  const auto bitpacking_axis = ce::core::Axis::ColWise;
-
-  // input matrix (row-major memory laytout)
-  const std::size_t num_rows = 8, num_cols = 2;
-  std::array<TIn, 16> input{1,  1,  1,  -1, -1, 1, 1, -1,
-                            -1, -1, -1, -1, -1, 1, 1, 1};
-
-  // expected output matrix after bitpacking
-  const std::size_t expected_num_rows = 1, expected_num_cols = 2;
-  std::vector<std::uint8_t> expected;
-  if (CE_IS_BIG_ENDIAN)
-    expected = {0b00101110, 0b01011100};
-  else
-    expected = {0b01110100, 0b00111010};
-
-  std::vector<std::uint8_t> output;
-  std::size_t num_rows_bp = 0, num_cols_bp = 0, bitpadding = 0;
+  std::vector<std::uint8_t> output(
+      ce::core::GetPackedMatrixSize<std::uint8_t>(num_rows, num_cols));
   ce::core::packbits_matrix<ce::core::BitpackOrder::Optimized>(
-      input.data(), num_rows, num_cols, output, num_rows_bp, num_cols_bp,
-      bitpadding, bitpacking_axis);
+      input.data(), num_rows, num_cols, output.data());
 
-  EXPECT_EQ(num_rows_bp, expected_num_rows);
-  EXPECT_EQ(num_cols_bp, expected_num_cols);
-  EXPECT_EQ(output.size(), num_rows_bp * num_cols_bp);
   EXPECT_THAT(output, ::testing::ElementsAreArray(expected));
 }
 
 template <class TIn, class TOut, std::size_t num_rows, std::size_t num_cols>
-void test_bitpacking(const ce::core::Axis bitpacking_axis,
-                     const std::size_t expected_num_rows,
+void test_bitpacking(const std::size_t expected_num_rows,  // NOTE(review): expected_num_rows / expected_num_cols are no longer read now that the EXPECT_EQ checks are removed
                      const std::size_t expected_num_cols) {
   const std::size_t bitwidth = std::numeric_limits<TOut>::digits;
 
   const std::size_t num_elems = num_rows * num_cols;
   std::array<TIn, num_elems> input;
   input.fill(-1);
 
-  std::vector<TOut> output;
-  std::size_t num_rows_bp = 0, num_cols_bp = 0, bitpadding = 0;
+  std::vector<TOut> output(  // pre-sized up front because packbits_matrix now takes a raw output pointer
+      ce::core::GetPackedMatrixSize<TOut>(num_rows, num_cols));
   ce::core::packbits_matrix<ce::core::BitpackOrder::Optimized>(
-      input.data(), num_rows, num_cols, output, num_rows_bp, num_cols_bp,
-      bitpadding, bitpacking_axis);
+      input.data(), num_rows, num_cols, output.data());  // zero_point is left at its default of 0
 
   TOut expected_value = std::numeric_limits<TOut>::max();
   const std::size_t num_elems_bp = num_elems / bitwidth;
   std::array<TOut, num_elems_bp> expected;
   expected.fill(expected_value);
 
-  EXPECT_EQ(num_rows_bp, expected_num_rows);
-  EXPECT_EQ(num_cols_bp, expected_num_cols);
-  EXPECT_EQ(bitpadding, 0);
-  EXPECT_EQ(output.size(), num_rows_bp * num_cols_bp);
   EXPECT_THAT(output, ::testing::ElementsAreArray(expected));
 }
 
 TEST(BitpackingTests, BitpackingRowMajorUInt8NonUniformInput) {
-  test_bitpacking_nonuniform_input_rowwise<float>();
-}
-
-TEST(BitpackingTests, BitpackingColMajorUInt8NonUniformInput) {
-  test_bitpacking_nonuniform_input_colwise<float>();
+  test_bitpacking_nonuniform_input<float>();  // float inputs, packed row by row into std::uint8_t words
 }
 
 TEST(BitpackingTests, BitpackingRowMajorUInt8) {
-  test_bitpacking<float, std::uint8_t, 2, 128>(ce::core::Axis::RowWise, 2, 16);
+  test_bitpacking<float, std::uint8_t, 2, 128>(2, 16);  // 2 x 128 floats into uint8 words; NOTE(review): the (2, 16) arguments are not asserted by test_bitpacking
 }
 
 TEST(BitpackingTests, BitpackingRowMajorUInt32) {
-  test_bitpacking<float, std::uint32_t, 2, 128>(ce::core::Axis::RowWise, 2, 4);
+  test_bitpacking<float, std::uint32_t, 2, 128>(2, 4);  // 2 x 128 floats into uint32 words; NOTE(review): the (2, 4) arguments are not asserted by test_bitpacking
 }
 
 TEST(BitpackingTests, BitpackingRowMajorUInt64) {
-  test_bitpacking<float, std::uint64_t, 2, 128>(ce::core::Axis::RowWise, 2, 2);
-}
-
-TEST(BitpackingTests, BitpackingColumnMajorUInt8) {
-  test_bitpacking<float, std::uint8_t, 128, 2>(ce::core::Axis::ColWise, 16, 2);
-}
-
-TEST(BitpackingTests, BitpackingColumnMajorUInt32) {
-  test_bitpacking<float, std::uint32_t, 128, 2>(ce::core::Axis::ColWise, 4, 2);
-}
-
-TEST(BitpackingTests, BitpackingColumnMajorUInt64) {
-  test_bitpacking<float, std::uint64_t, 128, 2>(ce::core::Axis::ColWise, 2, 2);
+  test_bitpacking<float, std::uint64_t, 2, 128>(2, 2);  // 2 x 128 floats into uint64 words; NOTE(review): the (2, 2) arguments are not asserted by test_bitpacking
 }
 
 TEST(BitpackingWithBitPaddingTests, RowMajorPadding) {
-  const auto bitpacking_axis = ce::core::Axis::RowWise;
   // input matrix
   const int num_rows = 2;
   const int num_cols = 9;
   std::vector<float> input{-1, -1, 1,  -1, 1, 1, 1,  -1, -1,
                            -1, 1,  -1, 1,  1, 1, -1, -1, 1};
 
   // expected output matrix after bitpacking
-  const std::size_t expected_num_rows = 2;
-  const std::size_t expected_num_cols = 2;
   std::vector<std::uint8_t> expected;
   if (CE_IS_BIG_ENDIAN)
     expected = {0b11010001, 0b10000000, 0b10100011, 0b00000000};
   else
     expected = {0b10001011, 0b00000001, 0b11000101, 0b00000000};
 
-  std::vector<std::uint8_t> output;
-  std::size_t num_rows_bp = 0, num_cols_bp = 0, bitpadding = 0;
-  ce::core::packbits_matrix<ce::core::BitpackOrder::Optimized>(
-      input.data(), num_rows, num_cols, output, num_rows_bp, num_cols_bp,
-      bitpadding, bitpacking_axis);
-
-  EXPECT_EQ(num_rows_bp, expected_num_rows);
-  EXPECT_EQ(num_cols_bp, expected_num_cols);
-  EXPECT_EQ(bitpadding, 7);
-  EXPECT_EQ(output.size(), num_rows_bp * num_cols_bp);
-  EXPECT_THAT(output, ::testing::ElementsAreArray(expected));
-};
-
-TEST(BitpackingWithBitPaddingTests, ColMajorPadding) {
-  const auto bitpacking_axis = ce::core::Axis::ColWise;
-  // The input matrix is:
-  const int num_rows = 9;
-  const int num_cols = 2;
-  std::vector<float> input{-1, -1, -1, 1, 1,  -1, -1, 1,  1,
-                           1,  1,  1,  1, -1, -1, -1, -1, 1};
-
-  // expected output matrix after bitpacking
-  const std::size_t expected_num_rows = 2;
-  const std::size_t expected_num_cols = 2;
-  std::vector<std::uint8_t> expected;
-  if (CE_IS_BIG_ENDIAN)
-    expected = {0b11010001, 0b10100011, 0b10000000, 0b00000000};
-  else
-    expected = {0b10001011, 0b11000101, 0b00000001, 0b00000000};
-
-  std::vector<std::uint8_t> output;
-  std::size_t num_rows_bp = 0, num_cols_bp = 0, bitpadding = 0;
+  std::vector<std::uint8_t> output(  // pre-sized to 2 rows * ceil(9 / 8) = 4 bytes, matching the 4 expected values
+      ce::core::GetPackedMatrixSize<std::uint8_t>(num_rows, num_cols));
   ce::core::packbits_matrix<ce::core::BitpackOrder::Optimized>(
-      input.data(), num_rows, num_cols, output, num_rows_bp, num_cols_bp,
-      bitpadding, bitpacking_axis);
+      input.data(), num_rows, num_cols, output.data());  // each 9-column row spills into a second byte whose unused bits are padding
 
-  EXPECT_EQ(num_rows_bp, expected_num_rows);
-  EXPECT_EQ(num_cols_bp, expected_num_cols);
-  EXPECT_EQ(bitpadding, 7);
-  EXPECT_EQ(output.size(), num_rows_bp * num_cols_bp);
   EXPECT_THAT(output, ::testing::ElementsAreArray(expected));
 };
 
 
@@ -45,11 +45,9 @@ DenseElementsAttr Bitpack(PatternRewriter& builder, Attribute x) {
   std::vector<PackedType> new_values(num_rows * packed_channels);
 
   const float* in_ptr = &(*dense_elements_iter.begin());
-  std::size_t filter_rows_bp, filter_cols_bp, filter_bitpadding;
   using namespace compute_engine::core;
-  packbits_matrix<BitpackOrder::Canonical>(
-      in_ptr, num_rows, unpacked_channels, new_values, filter_rows_bp,
-      filter_cols_bp, filter_bitpadding, Axis::RowWise);
+  packbits_matrix<BitpackOrder::Canonical>(in_ptr, num_rows, unpacked_channels,
+                                           new_values.data());
 
   RankedTensorType out_tensor_type =
       RankedTensorType::get({shape[0], shape[1], shape[2], packed_channels},