Skip to content

Commit 43732c5

Browse files
authored
Add null pages and boundary order (Fixes #92) (#94)
Problem ======= Parquet file column indexes are required to have `null_pages` and `boundary_order`, but both fields were missing from files generated by Parquetjs. https://github.com/apache/parquet-format/blob/1603152f8991809e8ad29659dffa224b4284f31b/src/main/thrift/parquet.thrift#L955 Closes #92 Solution ======== When writing the column index, record a `null_pages` entry for each page and default `boundary_order` to 0 (unordered). Note: Although the format marks these fields as required, not every library enforces the requirement strictly. Steps to Verify: ---------------- 1. Check out the branch 2. `npm i && npm run build && npm pack` 3. Install parquet cli tools (macOS brew: `brew install parquet-cli`) 4. Check out the bug repo from #92 https://github.com/noxify/parquetjs_bug/ 5. `cd parquetjs_bug/parquetjs && npm i` 6. `node index.js && parquet column-index ../generated_files/parquetjs/change.parquet` will FAIL 7. `npm i ../parquetjs/dsnp-parquetjs-0.0.0.tgz` 8. `node index.js && parquet column-index ../generated_files/parquetjs/change.parquet` will PASS!
1 parent 19f3ffa commit 43732c5

File tree

2 files changed

+21
-0
lines changed

2 files changed

+21
-0
lines changed

lib/writer.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -634,8 +634,11 @@ async function encodeColumnChunk(pages: Page[], opts: {column: ParquetField, bas
634634

635635
/* compile statistics ColumnIndex and OffsetIndex*/
636636
let columnIndex = new parquet_thrift.ColumnIndex();
637+
columnIndex.null_pages = [];
637638
columnIndex.max_values = [];
638639
columnIndex.min_values = [];
640+
// Default to unordered
641+
columnIndex.boundary_order = 0;
639642
let offsetIndex = new parquet_thrift.OffsetIndex();
640643
offsetIndex.page_locations = [];
641644

@@ -659,6 +662,8 @@ async function encodeColumnChunk(pages: Page[], opts: {column: ParquetField, bas
659662
statistics.null_count.setValue(statistics.null_count.valueOf() + (page.statistics.null_count?.valueOf() || 0));
660663
page.distinct_values.forEach((value: unknown) => distinct_values.add(value));
661664

665+
// If the number of values and the count of nulls are the same, this is a null page
666+
columnIndex.null_pages.push( page.num_values === statistics.null_count.valueOf() );
662667
columnIndex.max_values.push( encodeStatisticsValue(page.statistics.max_value, opts.column) );
663668
columnIndex.min_values.push( encodeStatisticsValue(page.statistics.min_value, opts.column) );
664669
}

test/statistics.js

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,34 +150,50 @@ describe('statistics', async function() {
150150
const name = await reader.envelopeReader.readColumnIndex('name', row);
151151
assert.deepEqual(name.min_values, ['apples','banana']);
152152
assert.deepEqual(name.max_values, ['oranges','banana']);
153+
assert.deepEqual(name.null_pages, [false, false]);
154+
assert.deepEqual(name.boundary_order, 0);
153155

154156
const quantity = await reader.envelopeReader.readColumnIndex('quantity', row);
155157
assert.deepEqual(quantity.min_values, [10n, undefined]);
156158
assert.deepEqual(quantity.max_values, [20n, undefined]);
159+
assert.deepEqual(quantity.null_pages, [false, false]);
160+
assert.deepEqual(quantity.boundary_order, 0);
157161

158162
const price = await reader.envelopeReader.readColumnIndex('price', row);
159163
assert.deepEqual(price.min_values, [2.6, 3.2]);
160164
assert.deepEqual(price.max_values, [4.2, 3.2]);
165+
assert.deepEqual(price.null_pages, [false, false]);
166+
assert.deepEqual(price.boundary_order, 0)
161167

162168
const day = await reader.envelopeReader.readColumnIndex('day', row);
163169
assert.deepEqual(day.min_values, [ new Date('2008-11-26'), new Date('2017-11-26') ]);
164170
assert.deepEqual(day.max_values, [ new Date('2018-03-03'), new Date('2017-11-26') ]);
171+
assert.deepEqual(day.null_pages, [false, false]);
172+
assert.deepEqual(day.boundary_order, 0)
165173

166174
const finger = await reader.envelopeReader.readColumnIndex('finger', row);
167175
assert.deepEqual(finger.min_values, [ Buffer.from('ABCDE'), Buffer.from('FNORD') ]);
168176
assert.deepEqual(finger.max_values, [ Buffer.from('XCVBN'), Buffer.from('FNORD')]);
177+
assert.deepEqual(finger.null_pages, [false, false]);
178+
assert.deepEqual(finger.boundary_order, 0)
169179

170180
const stockQuantity = await reader.envelopeReader.readColumnIndex('stock,quantity', row);
171181
assert.deepEqual(stockQuantity.min_values, [ 10n, undefined ]);
172182
assert.deepEqual(stockQuantity.max_values, [ 50n, undefined ]);
183+
assert.deepEqual(stockQuantity.null_pages, [false, false]);
184+
assert.deepEqual(stockQuantity.boundary_order, 0)
173185

174186
const stockWarehouse = await reader.envelopeReader.readColumnIndex('stock,warehouse', row);
175187
assert.deepEqual(stockWarehouse.min_values, [ 'A', undefined ]);
176188
assert.deepEqual(stockWarehouse.max_values, [ 'x', undefined ]);
189+
assert.deepEqual(stockWarehouse.null_pages, [false, false]);
190+
assert.deepEqual(stockWarehouse.boundary_order, 0)
177191

178192
const colour = await reader.envelopeReader.readColumnIndex('colour', row);
179193
assert.deepEqual(colour.min_values, [ 'brown', 'yellow' ]);
180194
assert.deepEqual(colour.max_values, [ 'yellow', 'yellow' ]);
195+
assert.deepEqual(colour.null_pages, [false, false]);
196+
assert.deepEqual(colour.boundary_order, 0)
181197

182198
const inter = await reader.envelopeReader.readColumnIndex('inter', row).catch(e => e);
183199
assert.equal(inter.message,'Column Index Missing');

0 commit comments

Comments
 (0)