Skip to content

Commit a011a2e

Browse files
dgaudet and wilwade authored
Feature - collect and report multiple field errors (#75)
Problem ======= This PR implements 2 enhancements to schema error reporting. * When a parquet schema includes an invalid type, encoding, or compression, the current error does not indicate which column has the problem * When a parquet schema has multiple issues, the code currently fails on the first one, which makes fixing multiple errors quite cumbersome Solution ======== Modified schema.ts and added tests to: * Change error messages from the original `invalid parquet type: UNKNOWN` to `Invalid parquet type: UNKNOWN, for Column: quantity` * Keep track of schema errors as we loop through each column in the schema, and at the end, if there are any errors, report them all as below: `Invalid parquet type: UNKNOWN, for Column: quantity` `Invalid parquet type: UNKNOWN, for Column: value` Change summary: --------------- * Adding tests and code to ensure multiple field errors are reported, and that each error indicates which column it came from * Also adding code to handle multiple encoding and compression schema issues Steps to Verify: ---------------- 1. Download this [parquet file](https://usaz02prismdevmlaas01.blob.core.windows.net/ml-job-config/dataSets/multiple-unsupported-columns.parquet?sv=2020-10-02&st=2023-01-09T15%3A28%3A09Z&se=2025-01-10T15%3A28%3A00Z&sr=b&sp=r&sig=GS0Skk93DCn5CnC64DbnIH2U7JhzHM2nnhq1U%2B2HwPs%3D) 2. Attempt to open this parquet file with this library: `const reader = await parquet.ParquetReader.openFile(<path to parquet file>)` 3. You should receive errors for more than one column, each of which includes the name of the column it applies to --------- Co-authored-by: Wil Wade <[email protected]>
1 parent a62db08 commit a011a2e

File tree

2 files changed

+71
-3
lines changed

2 files changed

+71
-3
lines changed

lib/schema.ts

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ function buildFields(schema: SchemaDefinition, rLevelParentMax?: number, dLevelP
8181
}
8282

8383
let fieldList: Record<string, ParquetField> = {};
84+
let fieldErrors: Array<string> = [];
8485
for (let name in schema) {
8586
const opts = schema[name];
8687

@@ -129,9 +130,15 @@ function buildFields(schema: SchemaDefinition, rLevelParentMax?: number, dLevelP
129130
continue;
130131
}
131132

133+
let nameWithPath = (`${name}` || 'missing name')
134+
if (path && path.length > 0) {
135+
nameWithPath = `${path}.${nameWithPath}`
136+
}
137+
132138
const typeDef = opts.type ? parquet_types.PARQUET_LOGICAL_TYPES[opts.type] : undefined;
133139
if (!typeDef) {
134-
throw 'invalid parquet type: ' + (opts.type || "missing type");
140+
fieldErrors.push(`Invalid parquet type: ${(opts.type || "missing type")}, for Column: ${nameWithPath}`);
141+
continue;
135142
}
136143

137144
/* field encoding */
@@ -140,15 +147,15 @@ function buildFields(schema: SchemaDefinition, rLevelParentMax?: number, dLevelP
140147
}
141148

142149
if (!(opts.encoding in parquet_codec)) {
143-
throw 'unsupported parquet encoding: ' + opts.encoding;
150+
fieldErrors.push(`Unsupported parquet encoding: ${opts.encoding}, for Column: ${nameWithPath}`);
144151
}
145152

146153
if (!opts.compression) {
147154
opts.compression = 'UNCOMPRESSED';
148155
}
149156

150157
if (!(opts.compression in parquet_compression.PARQUET_COMPRESSION_METHODS)) {
151-
throw 'unsupported compression method: ' + opts.compression;
158+
fieldErrors.push(`Unsupported compression method: ${opts.compression}, for Column: ${nameWithPath}`);
152159
}
153160

154161
/* add to schema */
@@ -167,6 +174,10 @@ function buildFields(schema: SchemaDefinition, rLevelParentMax?: number, dLevelP
167174
};
168175
}
169176

177+
if (fieldErrors.length > 0) {
178+
throw fieldErrors.reduce((accumulator, currentVal) => accumulator + '\n' + currentVal);
179+
}
180+
170181
return fieldList;
171182
}
172183

test/schema.js

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -467,4 +467,61 @@ describe('ParquetSchema', function() {
467467
}
468468
});
469469

470+
it('should indicate which column had an invalid type in a simple flat schema', function() {
471+
assert.throws(() => {
472+
new parquet.ParquetSchema({
473+
quantity: {type: 'UNKNOWN'},
474+
})
475+
}, 'Invalid parquet type: UNKNOWN, for Column: quantity');
476+
});
477+
478+
it('should indicate each column which has an invalid type in a simple flat schema', function() {
479+
assert.throws(() => {
480+
new parquet.ParquetSchema({
481+
quantity: {type: 'UNKNOWN'},
482+
value: {type: 'UNKNOWN'},
483+
})
484+
}, 'Invalid parquet type: UNKNOWN, for Column: quantity\nInvalid parquet type: UNKNOWN, for Column: value');
485+
});
486+
487+
it('should indicate each column which has an invalid type when one is correct in a simple flat schema', function() {
488+
assert.throws(() => {
489+
new parquet.ParquetSchema({
490+
quantity: {type: 'INT32'},
491+
value: {type: 'UNKNOWN'},
492+
})
493+
}, 'Invalid parquet type: UNKNOWN, for Column: value');
494+
});
495+
496+
it('should indicate each column which has an invalid type in a nested schema', function() {
497+
assert.throws(() => {
498+
new parquet.ParquetSchema({
499+
name: { type: 'UTF8' },
500+
stock: {
501+
fields: {
502+
quantity: { type: 'UNKNOWN' },
503+
warehouse: { type: 'UNKNOWN' },
504+
}
505+
},
506+
price: { type: 'UNKNOWN' },
507+
})
508+
}, 'Invalid parquet type: UNKNOWN, for Column: stock.quantity\nInvalid parquet type: UNKNOWN, for Column: stock.warehouse');
509+
});
510+
511+
it('should indicate which column had an invalid encoding in a simple flat schema', function() {
512+
assert.throws(() => {
513+
new parquet.ParquetSchema({
514+
quantity: {type: 'INT32', encoding: 'UNKNOWN'},
515+
})
516+
}, 'Unsupported parquet encoding: UNKNOWN, for Column: quantity');
517+
});
518+
519+
it('should indicate which column had an invalid compression type in a simple flat schema', function() {
520+
assert.throws(() => {
521+
new parquet.ParquetSchema({
522+
quantity: {type: 'INT32', compression: 'UNKNOWN'},
523+
})
524+
}, 'Unsupported compression method: UNKNOWN, for Column: quantity');
525+
});
526+
470527
});

0 commit comments

Comments
 (0)