Skip to content

Commit 8d34ac1

Browse files
authored
Reference Tests and Breaking Change: Optional nullable fields are now null instead of undefined (#114)
Problem
=======
We wanted to add tests for all the test files in https://github.com/apache/parquet-testing

### Discovered Bugs
- We treated nulls as undefined, but others don't
- We incorrectly processed dictionary_page_offset >= 0 instead of only > 0

Solution
========
- Added new test that automatically tests all files: `test/reference-test/read-all.test.ts`
- Fixed found bugs with @shannonwells

Steps to Verify:
----------------
1. Run the tests
2. Comment out the bug fixes and see reference test files fail
1 parent 6fdb9da commit 8d34ac1

File tree

60 files changed

+92
-13
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

60 files changed

+92
-13
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,6 @@ npm-debug.log
55
.nyc_output
66
dist
77
!test/test-files/*.parquet
8+
!test/reference-test/files/*.parquet
89
examples/server/package-lock.json
9-
test/browser/*.js
10+
test/browser/*.js

lib/reader.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -672,8 +672,9 @@ export class ParquetEnvelopeReader {
672672
num_values: metadata.num_values
673673
});
674674

675-
if (metadata.dictionary_page_offset) {
676-
const offset = +metadata.dictionary_page_offset;
675+
// If this exists and is greater than zero then we need to have an offset
676+
if (metadata.dictionary_page_offset && +metadata.dictionary_page_offset > 0) {
677+
const offset: number = +metadata.dictionary_page_offset;
677678
const size = Math.min(+this.fileSize - offset, this.default_dictionary_size);
678679

679680
await this.read(offset, size, colChunk.file_path).then(async (buffer: Buffer) => {

lib/shred.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,8 @@ function materializeRecordField(record: Record<string, unknown>, branch: Array<P
227227
const node = branch[0];
228228

229229
if (dLevel < node.dLevelMax) {
230+
// This ensures that nulls are correctly processed
231+
record[node.name] = value;
230232
return;
231233
}
232234

test/integration.js

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,8 @@ async function readTestFile() {
303303
{ quantity: [10n], warehouse: "A" },
304304
{ quantity: [20n], warehouse: "B" }
305305
],
306-
colour: [ 'green', 'red' ]
306+
colour: [ 'green', 'red' ],
307+
meta_json: null,
307308
});
308309

309310
assert.deepEqual(await cursor.next(), {
@@ -317,11 +318,13 @@ async function readTestFile() {
317318
stock: [
318319
{ quantity: [50n, 33n], warehouse: "X" }
319320
],
320-
colour: [ 'orange' ]
321+
colour: [ 'orange' ],
322+
meta_json: null,
321323
});
322324

323325
assert.deepEqual(await cursor.next(), {
324326
name: 'kiwi',
327+
quantity: null,
325328
price: 4.2,
326329
day: new Date('2017-11-26'),
327330
date: new Date(TEST_VTIME + 8000 * i),
@@ -337,11 +340,13 @@ async function readTestFile() {
337340

338341
assert.deepEqual(await cursor.next(), {
339342
name: 'banana',
343+
quantity: null,
340344
price: 3.2,
341345
day: new Date('2017-11-26'),
342346
date: new Date(TEST_VTIME + 6000 * i),
343347
finger: Buffer.from("FNORD"),
344348
inter: { months: 42, days: 23, milliseconds: 777 },
349+
stock: null,
345350
colour: [ 'yellow' ],
346351
meta_json: { shape: 'curved' }
347352
});
@@ -366,8 +371,8 @@ async function readTestFile() {
366371
for (let i = 0; i < TEST_NUM_ROWS; ++i) {
367372
assert.deepEqual(await cursor.next(), { name: 'apples', quantity: 10n });
368373
assert.deepEqual(await cursor.next(), { name: 'oranges', quantity: 20n });
369-
assert.deepEqual(await cursor.next(), { name: 'kiwi' });
370-
assert.deepEqual(await cursor.next(), { name: 'banana' });
374+
assert.deepEqual(await cursor.next(), { name: 'kiwi', quantity: null });
375+
assert.deepEqual(await cursor.next(), { name: 'banana', quantity: null });
371376
}
372377

373378
assert.equal(await cursor.next(), null);

test/reference-test/README.md

Lines changed: 12 additions & 0 deletions
Binary file not shown.
1.81 KB
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
478 Bytes
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
478 Bytes
Binary file not shown.
Binary file not shown.
591 Bytes
Binary file not shown.
Binary file not shown.
2.47 KB
Binary file not shown.
Binary file not shown.
Binary file not shown.
329 Bytes
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
502 Bytes
Binary file not shown.
Binary file not shown.
461 Bytes
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
660 Bytes
Binary file not shown.

test/reference-test/read-all.test.ts

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import { expect } from "chai";
2+
import path from "node:path";
3+
import fs from "node:fs";
4+
5+
import parquet from '../../parquet';
6+
7+
// Used for testing a single file. Example:
8+
// const onlyTest = 'single_nan.parquet';
9+
const onlyTest = null;
10+
11+
// Test files currently unsupported / needing separate test
12+
const unsupported = [
13+
'byte_stream_split.zstd.parquet', // ZSTD unsupported
14+
'hadoop_lz4_compressed.parquet', // LZ4 unsupported
15+
'hadoop_lz4_compressed_larger.parquet', // LZ4 unsupported
16+
'lz4_raw_compressed.parquet', // LZ4_RAW unsupported
17+
'lz4_raw_compressed_larger.parquet', // LZ4_RAW unsupported
18+
'nested_structs.rust.parquet', // ZSTD unsupported
19+
'non_hadoop_lz4_compressed.parquet', // LZ4 unsupported
20+
'rle_boolean_encoding.parquet', // BUG?: https://github.com/LibertyDSNP/parquetjs/issues/113
21+
'datapage_v2.snappy.parquet', // DELTA_BINARY_PACKED unsupported
22+
'delta_binary_packed.parquet', // DELTA_BINARY_PACKED unsupported
23+
'delta_byte_array.parquet', // DELTA_BYTE_ARRAY unsupported
24+
'delta_encoding_optional_column.parquet', // DELTA_BINARY_PACKED unsupported
25+
'delta_encoding_required_column.parquet', // DELTA_BINARY_PACKED unsupported
26+
'delta_length_byte_array.parquet', // ZSTD unsupported, DELTA_BINARY_PACKED unsupported
27+
'float16_nonzeros_and_nans.parquet', // missing option: typeLength (required for FIXED_LEN_BYTE_ARRAY)
28+
'float16_zeros_and_nans.parquet', // missing option: typeLength (required for FIXED_LEN_BYTE_ARRAY)
29+
'large_string_map.brotli.parquet', // BUG?
30+
];
31+
32+
describe("Read Test for all files", function () {
33+
34+
const listOfFiles = fs.readdirSync(path.join(__dirname, 'files'))
35+
.filter(x => x.endsWith(".parquet") && !unsupported.includes(x));
36+
37+
for (const filename of listOfFiles) {
38+
if (onlyTest && onlyTest !== filename) continue;
39+
it(`Reading ${filename}`, async function () {
40+
const reader = await parquet.ParquetReader.openFile(path.join(__dirname, 'files', filename));
41+
const schema = reader.getSchema();
42+
expect(schema.fieldList).to.have.length.greaterThan(0);
43+
const cursor = reader.getCursor();
44+
const record = await cursor.next() as any;
45+
// Expect the same keys as top-level fields
46+
const expectedRecordKeys = schema.fieldList.filter(x => x.path.length === 1).map(x => x.name);
47+
expect(Object.keys(record)).to.deep.equal(expectedRecordKeys);
48+
})
49+
}
50+
});

test/shred.js

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -498,11 +498,11 @@ describe('ParquetShredder', function() {
498498

499499
assert.deepEqual(
500500
records[2],
501-
{ name: "kiwi", price: 99.0 });
501+
{ name: "kiwi", price: 99.0, stock: null });
502502

503503
assert.deepEqual(
504504
records[3],
505-
{ name: "banana", stock: [{ warehouse: "C" }], price: 42.0 });
505+
{ name: "banana", stock: [{ quantity: null, warehouse: "C" }], price: 42.0 });
506506
});
507507

508508
it('should materialize a static nested record with blank optional value', function() {
@@ -549,7 +549,7 @@ describe('ParquetShredder', function() {
549549

550550
assert.deepEqual(
551551
records[0],
552-
{ fruit: { name: "apple" } });
552+
{ fruit: { name: "apple", colour: null } });
553553

554554
});
555555

test/test-files.js

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ describe('test-files', function() {
109109

110110
it('test-converted-type-null.parquet loads', async function() {
111111
const data = await readData('test-converted-type-null.parquet');
112-
assert.deepEqual(data,[{foo: 'bar'},{}]);
112+
assert.deepEqual(data,[{foo: 'bar'},{foo: null}]);
113113
});
114114

115115
it('test-enum-type.parquet loads', async function() {
@@ -119,12 +119,20 @@ describe('test-files', function() {
119119

120120
it('test-null-dictionary.parquet loads', async function() {
121121
const data = await readData('test-null-dictionary.parquet');
122-
assert.deepEqual(data,[].concat.apply([{}],[...Array(3)].map( () => ([{foo: 'bar'}, {foo: 'baz'}]))));
122+
assert.deepEqual(
123+
data,
124+
[
125+
{ foo: null },
126+
{ foo: 'bar' }, { foo: 'baz' },
127+
{ foo: 'bar' }, { foo: 'baz' },
128+
{ foo: 'bar' }, { foo: 'baz' }
129+
]
130+
);
123131
});
124132

125133
it('test-null.parquet loads', async function() {
126134
const data = await readData('test-null.parquet');
127-
assert.deepEqual(data,[{foo: 1, bar: 2},{foo: 1}]);
135+
assert.deepEqual(data,[{foo: 1, bar: 2},{foo: 1, bar: null}]);
128136
});
129137

130138
it('test.parquet loads', async function() {

0 commit comments

Comments
 (0)