Skip to content

Commit b013bdc

Browse files
authored
Remove bitpacking dynamic memory allocations (#336)
1 parent b012df1 commit b013bdc

File tree

8 files changed

+188
-262
lines changed

8 files changed

+188
-262
lines changed

larq_compute_engine/core/packbits.h

Lines changed: 42 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
#include <cstdint>
66
#include <cstring>
77
#include <limits>
8-
#include <vector>
98

109
#include "larq_compute_engine/core/types.h"
1110
#ifdef __aarch64__
@@ -15,11 +14,30 @@
1514
namespace compute_engine {
1615
namespace core {
1716

18-
#define GET_POINTER_TO_ROW(array_pointer, lda, row_index) \
19-
((array_pointer) + ((row_index) * (lda)))
17+
// Utility functions
18+
constexpr int GetPackedSize(int unpacked_elements, int bitwidth) {
19+
return (unpacked_elements + bitwidth - 1) / bitwidth;
20+
}
21+
22+
constexpr int GetPackedMatrixSize(int rows, int cols, int bitwidth) {
23+
return rows * GetPackedSize(cols, bitwidth);
24+
}
25+
26+
template <typename TBitpacked>
27+
constexpr int GetPackedSize(int unpacked_elements) {
28+
return GetPackedSize(
29+
unpacked_elements,
30+
std::numeric_limits<
31+
typename std::make_unsigned<TBitpacked>::type>::digits);
32+
}
2033

21-
#define GET_ELEM_INDEX(row_index, col_index, lda) \
22-
((row_index) * (lda) + (col_index))
34+
template <typename TBitpacked>
35+
constexpr int GetPackedMatrixSize(int rows, int cols) {
36+
return GetPackedMatrixSize(
37+
rows, cols,
38+
std::numeric_limits<
39+
typename std::make_unsigned<TBitpacked>::type>::digits);
40+
}
2341

2442
template <class TIn, class TOut>
2543
inline void pack_canonical(const TIn* fptr, TOut* buf) {
@@ -368,89 +386,29 @@ inline void packbits_array(const TIn* input_array, const std::size_t n,
368386
}
369387
}
370388

371-
// input/output matrices are stored in row-major mode
372-
// bitpacking_axis argument specifies the dimension in matrix where
373-
// the bitpacking operation is performed. For example, a RowWise
374-
// bitpacking operation compresses an MxN matrix to a Mx(N/bitwidth)
375-
// matrix.
376-
template <BitpackOrder bitpack_order, class TIn, class TOutContainer>
377-
inline void packbits_matrix(const TIn* input_data,
378-
const std::size_t input_num_rows,
379-
const std::size_t input_num_cols,
380-
TOutContainer& output, std::size_t& output_num_rows,
381-
std::size_t& output_num_cols,
382-
std::size_t& output_bitpadding,
383-
const Axis bitpacking_axis,
389+
// Bitpacks each row of a row-major matrix
390+
template <BitpackOrder bitpack_order, class TIn, class TOut_>
391+
inline void packbits_matrix(const TIn* input, const std::size_t input_num_rows,
392+
const std::size_t input_num_cols, TOut_* output,
384393
const std::int32_t zero_point = 0) {
385394
// Force the types to be unsigned so that the function can be called on signed
386395
// types as well
387-
using TOut =
388-
typename std::make_unsigned<typename TOutContainer::value_type>::type;
389-
const std::size_t bitwidth = std::numeric_limits<TOut>::digits;
390-
391-
if (bitpacking_axis == Axis::RowWise) {
392-
// calculate size of bitpacked matrix and allocate its memory
393-
output_num_rows = input_num_rows;
394-
output_num_cols = (input_num_cols + bitwidth - 1) / bitwidth;
395-
396-
const auto output_size = output_num_cols * output_num_rows;
397-
output.resize(output_size);
398-
399-
const auto input_row_size = input_num_cols;
400-
const auto output_row_size = output_num_cols;
401-
402-
const auto num_extra_elems = input_num_cols % bitwidth;
403-
output_bitpadding = num_extra_elems == 0 ? 0 : bitwidth - num_extra_elems;
404-
405-
// iterate through each row of the input matrix and bitpack the row into
406-
// the corresponding memory location of the output matrix
407-
TOut* output_data = reinterpret_cast<TOut*>(output.data());
408-
for (size_t row_index = 0; row_index < input_num_rows; ++row_index)
409-
packbits_array<bitpack_order>(
410-
GET_POINTER_TO_ROW(input_data, input_row_size, row_index),
411-
input_row_size,
412-
GET_POINTER_TO_ROW(output_data, output_row_size, row_index),
413-
zero_point);
414-
return;
415-
}
416-
417-
if (bitpacking_axis == Axis::ColWise) {
418-
// calculate size of bitpacked matrix and allocate its memory
419-
output_num_rows = (input_num_rows + bitwidth - 1) / bitwidth;
420-
output_num_cols = input_num_cols;
421-
const auto output_size = output_num_cols * output_num_rows;
422-
output.resize(output_size);
423-
424-
const auto input_row_size = input_num_cols;
425-
const auto output_row_size = output_num_cols;
426-
427-
const auto num_extra_elems = input_num_rows % bitwidth;
428-
output_bitpadding = num_extra_elems == 0 ? 0 : bitwidth - num_extra_elems;
429-
430-
// allocate temporary buffers
431-
std::vector<TIn> input_buffer(input_num_rows);
432-
std::vector<TOut> output_buffer(output_num_rows);
433-
434-
TOut* output_data = reinterpret_cast<TOut*>(output.data());
435-
436-
// iterate through the columns
437-
for (size_t col_index = 0; col_index < input_num_cols; ++col_index) {
438-
// store the values of the current column in a buffer
439-
for (size_t row_index = 0; row_index < input_num_rows; ++row_index)
440-
input_buffer[row_index] =
441-
input_data[GET_ELEM_INDEX(row_index, col_index, input_row_size)];
442-
443-
// bitpack the buffer and store it in the output matrix
444-
packbits_array<bitpack_order>(input_buffer.data(), input_buffer.size(),
445-
output_buffer.data(), zero_point);
446-
447-
// store the bitpacked values of the current column in the output matrix
448-
for (size_t row_index = 0; row_index < output_num_rows; ++row_index)
449-
output_data[GET_ELEM_INDEX(row_index, col_index, output_row_size)] =
450-
output_buffer[row_index];
451-
}
452-
return;
396+
using TOut = typename std::make_unsigned<TOut_>::type;
397+
398+
// Calculate the size of the bitpacked rows
399+
const std::size_t output_num_cols = GetPackedSize<TOut>(input_num_cols);
400+
401+
// Iterate through each row of the input matrix and bitpack the row into the
402+
// corresponding memory location of the output matrix
403+
const TIn* input_ptr = input;
404+
TOut* output_ptr = reinterpret_cast<TOut*>(output);
405+
for (size_t row_index = 0; row_index < input_num_rows; ++row_index) {
406+
packbits_array<bitpack_order>(input_ptr, input_num_cols, output_ptr,
407+
zero_point);
408+
input_ptr += input_num_cols;
409+
output_ptr += output_num_cols;
453410
}
411+
return;
454412
}
455413

456414
template <typename TBitpacked, typename TUnpacked>

larq_compute_engine/core/packbits_utils.h

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,29 +11,36 @@ namespace compute_engine {
1111
namespace ce = compute_engine;
1212
namespace core {
1313

14+
template <typename TBitpacked>
15+
int GetPackedTensorSize(const RuntimeShape& shape) {
16+
constexpr auto bitwidth = std::numeric_limits<
17+
typename std::make_unsigned<TBitpacked>::type>::digits;
18+
const int dims = shape.DimensionsCount();
19+
// Pack the tensor along the last dimension
20+
const int rows = FlatSizeSkipDim(shape, dims - 1);
21+
const int cols = shape.Dims(dims - 1);
22+
return ce::core::GetPackedMatrixSize(rows, cols, bitwidth);
23+
}
24+
1425
// Convenience function for bitpacking a tensor along its last dimension
1526
// and updating the tensor shape
1627
template <class T, class TBitpacked>
1728
inline void packbits_tensor(const RuntimeShape& in_shape, const T* in_data,
1829
const std::int32_t zero_point,
19-
RuntimeShape& out_shape,
20-
std::vector<TBitpacked>& out_data) {
30+
RuntimeShape& out_shape, TBitpacked* out_data) {
2131
const int dims = in_shape.DimensionsCount();
2232
// Pack the tensor along the last dimension
2333
const int rows = FlatSizeSkipDim(in_shape, dims - 1);
2434
const int cols = in_shape.Dims(dims - 1);
2535

26-
std::size_t rows_bp = 0, cols_bp = 0;
27-
std::size_t bitpadding = 0;
2836
{
2937
gemmlowp::ScopedProfilingLabel label("Packbits");
3038
ce::core::packbits_matrix<ce::core::BitpackOrder::Optimized>(
31-
in_data, rows, cols, out_data, rows_bp, cols_bp, bitpadding,
32-
ce::core::Axis::RowWise, zero_point);
39+
in_data, rows, cols, out_data, zero_point);
3340
}
3441

3542
out_shape.ReplaceWith(dims, in_shape.DimsData());
36-
out_shape.SetDim(dims - 1, cols_bp);
43+
out_shape.SetDim(dims - 1, GetPackedSize<TBitpacked>(cols));
3744
}
3845

3946
// Convenience function for going from a shape to the packed shape

larq_compute_engine/core/tests/packbits_tests.cc

Lines changed: 15 additions & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,7 @@ namespace ce = compute_engine;
1616
// operation itself while the uniform tests are used to verify the bitpacking
1717
// with different bitwidths
1818
template <class TIn>
19-
void test_bitpacking_nonuniform_input_rowwise() {
20-
const auto bitpacking_axis = ce::core::Axis::RowWise;
21-
19+
void test_bitpacking_nonuniform_input() {
2220
// input matrix (row-major memory layout)
2321
const std::size_t num_rows = 2, num_cols = 8;
2422
std::array<TIn, 16> input{1, 1, -1, 1, -1, -1, -1, 1,
@@ -32,164 +30,71 @@ void test_bitpacking_nonuniform_input_rowwise() {
3230
else
3331
expected = {0b01110100, 0b00111010};
3432

35-
std::vector<std::uint8_t> output;
36-
std::size_t num_rows_bp = 0, num_cols_bp = 0, bitpadding = 0;
37-
ce::core::packbits_matrix<ce::core::BitpackOrder::Optimized>(
38-
input.data(), num_rows, num_cols, output, num_rows_bp, num_cols_bp,
39-
bitpadding, bitpacking_axis);
40-
41-
EXPECT_EQ(num_rows_bp, expected_num_rows);
42-
EXPECT_EQ(num_cols_bp, expected_num_cols);
43-
EXPECT_EQ(output.size(), num_rows_bp * num_cols_bp);
44-
EXPECT_THAT(output, ::testing::ElementsAreArray(expected));
45-
}
46-
47-
template <class TIn>
48-
void test_bitpacking_nonuniform_input_colwise() {
49-
const auto bitpacking_axis = ce::core::Axis::ColWise;
50-
51-
// input matrix (row-major memory layout)
52-
const std::size_t num_rows = 8, num_cols = 2;
53-
std::array<TIn, 16> input{1, 1, 1, -1, -1, 1, 1, -1,
54-
-1, -1, -1, -1, -1, 1, 1, 1};
55-
56-
// expected output matrix after bitpacking
57-
const std::size_t expected_num_rows = 1, expected_num_cols = 2;
58-
std::vector<std::uint8_t> expected;
59-
if (CE_IS_BIG_ENDIAN)
60-
expected = {0b00101110, 0b01011100};
61-
else
62-
expected = {0b01110100, 0b00111010};
63-
64-
std::vector<std::uint8_t> output;
65-
std::size_t num_rows_bp = 0, num_cols_bp = 0, bitpadding = 0;
33+
std::vector<std::uint8_t> output(
34+
ce::core::GetPackedMatrixSize<std::uint8_t>(num_rows, num_cols));
6635
ce::core::packbits_matrix<ce::core::BitpackOrder::Optimized>(
67-
input.data(), num_rows, num_cols, output, num_rows_bp, num_cols_bp,
68-
bitpadding, bitpacking_axis);
36+
input.data(), num_rows, num_cols, output.data());
6937

70-
EXPECT_EQ(num_rows_bp, expected_num_rows);
71-
EXPECT_EQ(num_cols_bp, expected_num_cols);
72-
EXPECT_EQ(output.size(), num_rows_bp * num_cols_bp);
7338
EXPECT_THAT(output, ::testing::ElementsAreArray(expected));
7439
}
7540

7641
template <class TIn, class TOut, std::size_t num_rows, std::size_t num_cols>
77-
void test_bitpacking(const ce::core::Axis bitpacking_axis,
78-
const std::size_t expected_num_rows,
42+
void test_bitpacking(const std::size_t expected_num_rows,
7943
const std::size_t expected_num_cols) {
8044
const std::size_t bitwidth = std::numeric_limits<TOut>::digits;
8145

8246
const std::size_t num_elems = num_rows * num_cols;
8347
std::array<TIn, num_elems> input;
8448
input.fill(-1);
8549

86-
std::vector<TOut> output;
87-
std::size_t num_rows_bp = 0, num_cols_bp = 0, bitpadding = 0;
50+
std::vector<TOut> output(
51+
ce::core::GetPackedMatrixSize<TOut>(num_rows, num_cols));
8852
ce::core::packbits_matrix<ce::core::BitpackOrder::Optimized>(
89-
input.data(), num_rows, num_cols, output, num_rows_bp, num_cols_bp,
90-
bitpadding, bitpacking_axis);
53+
input.data(), num_rows, num_cols, output.data());
9154

9255
TOut expected_value = std::numeric_limits<TOut>::max();
9356
const std::size_t num_elems_bp = num_elems / bitwidth;
9457
std::array<TOut, num_elems_bp> expected;
9558
expected.fill(expected_value);
9659

97-
EXPECT_EQ(num_rows_bp, expected_num_rows);
98-
EXPECT_EQ(num_cols_bp, expected_num_cols);
99-
EXPECT_EQ(bitpadding, 0);
100-
EXPECT_EQ(output.size(), num_rows_bp * num_cols_bp);
10160
EXPECT_THAT(output, ::testing::ElementsAreArray(expected));
10261
}
10362

10463
TEST(BitpackingTests, BitpackingRowMajorUInt8NonUniformInput) {
105-
test_bitpacking_nonuniform_input_rowwise<float>();
106-
}
107-
108-
TEST(BitpackingTests, BitpackingColMajorUInt8NonUniformInput) {
109-
test_bitpacking_nonuniform_input_colwise<float>();
64+
test_bitpacking_nonuniform_input<float>();
11065
}
11166

11267
TEST(BitpackingTests, BitpackingRowMajorUInt8) {
113-
test_bitpacking<float, std::uint8_t, 2, 128>(ce::core::Axis::RowWise, 2, 16);
68+
test_bitpacking<float, std::uint8_t, 2, 128>(2, 16);
11469
}
11570

11671
TEST(BitpackingTests, BitpackingRowMajorUInt32) {
117-
test_bitpacking<float, std::uint32_t, 2, 128>(ce::core::Axis::RowWise, 2, 4);
72+
test_bitpacking<float, std::uint32_t, 2, 128>(2, 4);
11873
}
11974

12075
TEST(BitpackingTests, BitpackingRowMajorUInt64) {
121-
test_bitpacking<float, std::uint64_t, 2, 128>(ce::core::Axis::RowWise, 2, 2);
122-
}
123-
124-
TEST(BitpackingTests, BitpackingColumnMajorUInt8) {
125-
test_bitpacking<float, std::uint8_t, 128, 2>(ce::core::Axis::ColWise, 16, 2);
126-
}
127-
128-
TEST(BitpackingTests, BitpackingColumnMajorUInt32) {
129-
test_bitpacking<float, std::uint32_t, 128, 2>(ce::core::Axis::ColWise, 4, 2);
130-
}
131-
132-
TEST(BitpackingTests, BitpackingColumnMajorUInt64) {
133-
test_bitpacking<float, std::uint64_t, 128, 2>(ce::core::Axis::ColWise, 2, 2);
76+
test_bitpacking<float, std::uint64_t, 2, 128>(2, 2);
13477
}
13578

13679
TEST(BitpackingWithBitPaddingTests, RowMajorPadding) {
137-
const auto bitpacking_axis = ce::core::Axis::RowWise;
13880
// input matrix
13981
const int num_rows = 2;
14082
const int num_cols = 9;
14183
std::vector<float> input{-1, -1, 1, -1, 1, 1, 1, -1, -1,
14284
-1, 1, -1, 1, 1, 1, -1, -1, 1};
14385

14486
// expected output matrix after bitpacking
145-
const std::size_t expected_num_rows = 2;
146-
const std::size_t expected_num_cols = 2;
14787
std::vector<std::uint8_t> expected;
14888
if (CE_IS_BIG_ENDIAN)
14989
expected = {0b11010001, 0b10000000, 0b10100011, 0b00000000};
15090
else
15191
expected = {0b10001011, 0b00000001, 0b11000101, 0b00000000};
15292

153-
std::vector<std::uint8_t> output;
154-
std::size_t num_rows_bp = 0, num_cols_bp = 0, bitpadding = 0;
155-
ce::core::packbits_matrix<ce::core::BitpackOrder::Optimized>(
156-
input.data(), num_rows, num_cols, output, num_rows_bp, num_cols_bp,
157-
bitpadding, bitpacking_axis);
158-
159-
EXPECT_EQ(num_rows_bp, expected_num_rows);
160-
EXPECT_EQ(num_cols_bp, expected_num_cols);
161-
EXPECT_EQ(bitpadding, 7);
162-
EXPECT_EQ(output.size(), num_rows_bp * num_cols_bp);
163-
EXPECT_THAT(output, ::testing::ElementsAreArray(expected));
164-
};
165-
166-
TEST(BitpackingWithBitPaddingTests, ColMajorPadding) {
167-
const auto bitpacking_axis = ce::core::Axis::ColWise;
168-
// The input matrix is:
169-
const int num_rows = 9;
170-
const int num_cols = 2;
171-
std::vector<float> input{-1, -1, -1, 1, 1, -1, -1, 1, 1,
172-
1, 1, 1, 1, -1, -1, -1, -1, 1};
173-
174-
// expected output matrix after bitpacking
175-
const std::size_t expected_num_rows = 2;
176-
const std::size_t expected_num_cols = 2;
177-
std::vector<std::uint8_t> expected;
178-
if (CE_IS_BIG_ENDIAN)
179-
expected = {0b11010001, 0b10100011, 0b10000000, 0b00000000};
180-
else
181-
expected = {0b10001011, 0b11000101, 0b00000001, 0b00000000};
182-
183-
std::vector<std::uint8_t> output;
184-
std::size_t num_rows_bp = 0, num_cols_bp = 0, bitpadding = 0;
93+
std::vector<std::uint8_t> output(
94+
ce::core::GetPackedMatrixSize<std::uint8_t>(num_rows, num_cols));
18595
ce::core::packbits_matrix<ce::core::BitpackOrder::Optimized>(
186-
input.data(), num_rows, num_cols, output, num_rows_bp, num_cols_bp,
187-
bitpadding, bitpacking_axis);
96+
input.data(), num_rows, num_cols, output.data());
18897

189-
EXPECT_EQ(num_rows_bp, expected_num_rows);
190-
EXPECT_EQ(num_cols_bp, expected_num_cols);
191-
EXPECT_EQ(bitpadding, 7);
192-
EXPECT_EQ(output.size(), num_rows_bp * num_cols_bp);
19398
EXPECT_THAT(output, ::testing::ElementsAreArray(expected));
19499
};
195100

larq_compute_engine/mlir/transforms/optimize.cc

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,11 +45,9 @@ DenseElementsAttr Bitpack(PatternRewriter& builder, Attribute x) {
4545
std::vector<PackedType> new_values(num_rows * packed_channels);
4646

4747
const float* in_ptr = &(*dense_elements_iter.begin());
48-
std::size_t filter_rows_bp, filter_cols_bp, filter_bitpadding;
4948
using namespace compute_engine::core;
50-
packbits_matrix<BitpackOrder::Canonical>(
51-
in_ptr, num_rows, unpacked_channels, new_values, filter_rows_bp,
52-
filter_cols_bp, filter_bitpadding, Axis::RowWise);
49+
packbits_matrix<BitpackOrder::Canonical>(in_ptr, num_rows, unpacked_channels,
50+
new_values.data());
5351

5452
RankedTensorType out_tensor_type =
5553
RankedTensorType::get({shape[0], shape[1], shape[2], packed_channels},

0 commit comments

Comments
 (0)