Skip to content

Commit c07e7e8

Browse files
YECHUNAN and wilwade authored
Add support to byte array decimal fields (#97)
Problem ======= Address #91 Solution ======== When encountering such byte array represented "Decimal" fields, parse them into raw buffers. Change summary: --------------- - Added code to parse "Decimal" type fields represented by byte arrays (fixed length or non-fixed length) into raw buffer values for further client-side processing. - Added two test cases verifying the added code. - Loosened the precision check to allow values greater than 18 for byte array represented "Decimal" fields. Steps to Verify: ---------------- - Use the library to open a parquet file which contains a "Decimal" field represented by a byte array whose precision is greater than 18. - Before the change, the library will throw an error saying precision cannot be greater than 18. - After the change, the library will parse those fields to their raw buffer values and return records normally. --------- Co-authored-by: Wil Wade <[email protected]>
1 parent ac5257d commit c07e7e8

14 files changed

+785
-643
lines changed

esbuild-serve.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ require('esbuild')
1111
}, {
1212
entryPoints: ['parquet.ts'],
1313
outfile: 'main.js',
14-
define: {"process.env.NODE_DEBUG": false, "process.env.NODE_ENV": "\"production\"", global: "window" },
14+
define: {"process.env.NODE_DEBUG": "false", "process.env.NODE_ENV": "\"production\"", global: "window" },
1515
platform: 'browser',
1616
plugins: [compressionBrowserPlugin,wasmPlugin],
1717
sourcemap: "external",

esbuild.js

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ const baseConfig = {
66
bundle: true,
77
entryPoints: ['parquet.ts'],
88
define: {
9-
"process.env.NODE_DEBUG": false,
9+
"process.env.NODE_DEBUG": "false",
1010
"process.env.NODE_ENV": "\"production\"",
1111
global: "window"
1212
},
@@ -21,7 +21,7 @@ const testConfig = {
2121
bundle: true,
2222
entryPoints: ['test/browser/main.ts'],
2323
define: {
24-
"process.env.NODE_DEBUG": false,
24+
"process.env.NODE_DEBUG": "false",
2525
"process.env.NODE_ENV": "\"production\"",
2626
global: "window"
2727
},
@@ -61,6 +61,7 @@ Promise.all(targets.map(esbuild.build))
6161
})
6262
.catch(e => {
6363
console.error("Finished with errors: ", e.toString());
64+
process.exit(1);
6465
});
6566

6667

lib/reader.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -720,7 +720,7 @@ function decodeStatisticsValue(value: any, column: ParquetField | Options) {
720720
}
721721

722722
if (column.originalType) {
723-
value = parquet_types.fromPrimitive(column.originalType, value);
723+
value = parquet_types.fromPrimitive(column.originalType, value, column);
724724
}
725725
return value;
726726
}

lib/schema.ts

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ function buildFields(schema: SchemaDefinition, rLevelParentMax?: number, dLevelP
146146
nameWithPath = `${path}.${nameWithPath}`
147147
}
148148

149-
const typeDef = opts.type ? parquet_types.PARQUET_LOGICAL_TYPES[opts.type] : undefined;
149+
const typeDef = opts.type ? parquet_types.getParquetTypeDataObject(opts.type, opts) : undefined;
150150
if (!typeDef) {
151151
fieldErrors.push(`Invalid parquet type: ${(opts.type || "missing type")}, for Column: ${nameWithPath}`);
152152
continue;
@@ -172,7 +172,7 @@ function buildFields(schema: SchemaDefinition, rLevelParentMax?: number, dLevelP
172172
if (typeDef.originalType === 'DECIMAL') {
173173
// Default scale to 0 per https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal
174174
if (typeof opts.scale === "undefined") opts.scale = 0;
175-
fieldErrors = fieldErrors.concat(errorsForDecimalOpts(typeDef.originalType, opts, nameWithPath));
175+
fieldErrors = fieldErrors.concat(errorsForDecimalOpts(typeDef.originalType, typeDef.primitiveType, opts, nameWithPath));
176176
}
177177

178178
/* add to schema */
@@ -219,7 +219,7 @@ function isDefined<T>(val: T | undefined): val is T {
219219
return val !== undefined;
220220
}
221221

222-
function errorsForDecimalOpts(type: string, opts: FieldDefinition, columnName: string): string[] {
222+
function errorsForDecimalOpts(type: string, primitiveType: string | undefined, opts: FieldDefinition, columnName: string): string[] {
223223
const fieldErrors = []
224224
if(opts.precision === undefined || opts.precision < 1) {
225225
fieldErrors.push(
@@ -231,9 +231,9 @@ function errorsForDecimalOpts(type: string, opts: FieldDefinition, columnName: s
231231
`invalid schema for type: ${type}, for Column: ${columnName}, precision must be an integer`
232232
);
233233
}
234-
else if (opts.precision > 18) {
234+
else if (primitiveType === "INT64" && opts.precision > 18) {
235235
fieldErrors.push(
236-
`invalid schema for type: ${type}, for Column: ${columnName}, can not handle precision over 18`
236+
`invalid schema for type: ${type} and primitive type: ${primitiveType} for Column: ${columnName}, can not handle precision over 18`
237237
);
238238
}
239239
if (typeof opts.scale === "undefined" || opts.scale < 0) {
@@ -246,8 +246,7 @@ function errorsForDecimalOpts(type: string, opts: FieldDefinition, columnName: s
246246
`invalid schema for type: ${type}, for Column: ${columnName}, scale must be an integer`
247247
);
248248
}
249-
// Default precision to 18 if it is undefined as that is a different error
250-
else if (opts.scale > (opts.precision || 18)) {
249+
else if (opts.precision !== undefined && opts.scale > opts.precision) {
251250
fieldErrors.push(
252251
`invalid schema or precision for type: ${type}, for Column: ${columnName}, precision must be greater than or equal to scale`
253252
);

lib/shred.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ function shredRecordInternal(fields: Record<string, ParquetField>, record: Recor
151151
field.dLevelMax);
152152
} else {
153153
data[path].distinct_values!.add(values[i]);
154-
data[path].values!.push(parquet_types.toPrimitive(fieldType as string, values[i]));
154+
data[path].values!.push(parquet_types.toPrimitive(fieldType as string, values[i], field));
155155
data[path].rlevels!.push(rlvl_i);
156156
data[path].dlevels!.push(field.dLevelMax);
157157
data[path].count! += 1;
@@ -205,7 +205,8 @@ export const materializeRecords = function(schema: ParquetSchema, buffer: Record
205205
if (dLevel === field.dLevelMax) {
206206
value = parquet_types.fromPrimitive(
207207
field.originalType || field.primitiveType,
208-
values.next().value);
208+
values.next().value,
209+
field);
209210
}
210211

211212
records[rLevels[0] - 1] = records[rLevels[0] - 1] || {};

lib/types.ts

Lines changed: 84 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,84 @@
11
'use strict';
22
// Thanks to https://github.com/kbajalc/parquets for some of the code.
33
import * as BSON from "bson"
4-
import { PrimitiveType, OriginalType, ParquetType } from "./declare"
5-
6-
type ParquetTypeData = {
7-
[Property in ParquetType]: {
8-
primitiveType?: PrimitiveType,
9-
toPrimitive: Function,
10-
fromPrimitive?: Function,
11-
originalType?: OriginalType,
12-
typeLength?: number
13-
}
14-
}
4+
import { PrimitiveType, OriginalType, ParquetType, FieldDefinition, ParquetField } from "./declare"
5+
import { Options } from "./codec/types";
6+
7+
type ParquetTypeDataObject = {
8+
primitiveType?: PrimitiveType,
9+
toPrimitive: Function,
10+
fromPrimitive?: Function,
11+
originalType?: OriginalType,
12+
typeLength?: number
13+
};
1514

1615
interface INTERVAL {
1716
months: number,
1817
days: number,
1918
milliseconds: number
2019
}
2120

22-
export const PARQUET_LOGICAL_TYPES: ParquetTypeData = {
21+
export function getParquetTypeDataObject(type: ParquetType, field?: ParquetField | Options | FieldDefinition): ParquetTypeDataObject {
22+
if (type === 'DECIMAL') {
23+
if (field?.typeLength !== undefined) {
24+
return {
25+
primitiveType: 'FIXED_LEN_BYTE_ARRAY',
26+
originalType: 'DECIMAL',
27+
typeLength: field.typeLength,
28+
toPrimitive: toPrimitive_FIXED_LEN_BYTE_ARRAY_DECIMAL
29+
};
30+
} else if (field?.precision !== undefined && field.precision > 18) {
31+
return {
32+
primitiveType: 'BYTE_ARRAY',
33+
originalType: 'DECIMAL',
34+
typeLength: field.typeLength,
35+
toPrimitive: toPrimitive_BYTE_ARRAY_DECIMAL
36+
};
37+
} else {
38+
return {
39+
primitiveType: 'INT64',
40+
originalType: 'DECIMAL',
41+
toPrimitive: toPrimitive_INT64
42+
};
43+
}
44+
} else {
45+
return PARQUET_LOGICAL_TYPE_DATA[type];
46+
}
47+
}
48+
49+
const PARQUET_LOGICAL_TYPES = new Set<string>([
50+
'BOOLEAN',
51+
'INT32',
52+
'INT64',
53+
'INT96',
54+
'FLOAT',
55+
'DOUBLE',
56+
'BYTE_ARRAY',
57+
'FIXED_LEN_BYTE_ARRAY',
58+
'UTF8',
59+
'ENUM',
60+
'TIME_MILLIS',
61+
'TIME_MICROS',
62+
'DATE',
63+
'TIMESTAMP_MILLIS',
64+
'TIMESTAMP_MICROS',
65+
'UINT_8',
66+
'UINT_16',
67+
'UINT_32',
68+
'UINT_64',
69+
'INT_8',
70+
'INT_16',
71+
'INT_32',
72+
'INT_64',
73+
'DECIMAL',
74+
'JSON',
75+
'BSON',
76+
'INTERVAL',
77+
'MAP',
78+
'LIST'
79+
] satisfies ParquetType[])
80+
81+
const PARQUET_LOGICAL_TYPE_DATA: { [logicalType: string]: ParquetTypeDataObject } = {
2382
'BOOLEAN': {
2483
primitiveType: 'BOOLEAN',
2584
toPrimitive: toPrimitive_BOOLEAN,
@@ -133,11 +192,6 @@ export const PARQUET_LOGICAL_TYPES: ParquetTypeData = {
133192
originalType: 'INT_64',
134193
toPrimitive: toPrimitive_INT64
135194
},
136-
'DECIMAL': {
137-
primitiveType: 'INT64',
138-
originalType: 'DECIMAL',
139-
toPrimitive: toPrimitive_INT64
140-
},
141195
'JSON': {
142196
primitiveType: 'BYTE_ARRAY',
143197
originalType: 'JSON',
@@ -173,31 +227,30 @@ export const PARQUET_LOGICAL_TYPES: ParquetTypeData = {
173227
* @returns if type is a valid Parquet Type
174228
*/
175229
function isParquetType(type: string | undefined): type is ParquetType {
176-
return type !== undefined && (type in PARQUET_LOGICAL_TYPES);
230+
return type !== undefined && PARQUET_LOGICAL_TYPES.has(type);
177231
}
178232

179233
/**
180234
* Convert a value from it's native representation to the internal/underlying
181235
* primitive type
182236
*/
183-
export function toPrimitive(type: string | undefined, value: unknown) {
237+
export function toPrimitive(type: string | undefined, value: unknown, field?: ParquetField | Options) {
184238
if (!isParquetType(type)) {
185239
throw 'invalid type: ' + type || "undefined";
186240
}
187-
188-
return PARQUET_LOGICAL_TYPES[type].toPrimitive(value);
241+
return getParquetTypeDataObject(type, field).toPrimitive(value);
189242
}
190243

191244
/**
192245
* Convert a value from it's internal/underlying primitive representation to
193246
* the native representation
194247
*/
195-
export function fromPrimitive(type: string | undefined, value: unknown) {
248+
export function fromPrimitive(type: string | undefined, value: unknown, field?: ParquetField | Options) {
196249
if (!isParquetType(type)) {
197250
throw 'invalid type: ' + type || "undefined";
198251
}
199252

200-
const typeFromPrimitive = PARQUET_LOGICAL_TYPES[type].fromPrimitive
253+
const typeFromPrimitive = getParquetTypeDataObject(type, field).fromPrimitive
201254
if (typeFromPrimitive !== undefined) {
202255
return typeFromPrimitive(value)
203256
} else {
@@ -350,6 +403,14 @@ function toPrimitive_INT96(value: number | bigint | string) {
350403
}
351404
}
352405

406+
function toPrimitive_FIXED_LEN_BYTE_ARRAY_DECIMAL(value: Array<number>) {
407+
return Buffer.from(value);
408+
}
409+
410+
function toPrimitive_BYTE_ARRAY_DECIMAL(value: Array<number>) {
411+
return Buffer.from(value);
412+
}
413+
353414
function toPrimitive_MAP(value: any) {
354415
return value;
355416
}

lib/writer.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import * as parquet_codec from './codec'
66
import * as parquet_compression from './compression'
77
import * as parquet_types from './types'
88
import * as bloomFilterWriter from "./bloomFilterIO/bloomFilterWriter"
9-
import { WriterOptions, ParquetCodec, ParquetField, ColumnMetaDataExt, RowGroupExt, Page } from './declare'
9+
import { WriterOptions, ParquetCodec, ParquetField, ColumnMetaDataExt, RowGroupExt, Page, FieldDefinition } from './declare'
1010
import { Options } from './codec/types'
1111
import { ParquetSchema } from './schema'
1212
import Int64 from 'node-int64'
@@ -386,7 +386,7 @@ function encodeStatisticsValue(value: any, column: ParquetField | Options) {
386386
return Buffer.alloc(0);
387387
}
388388
if (column.originalType) {
389-
value = parquet_types.toPrimitive(column.originalType,value);
389+
value = parquet_types.toPrimitive(column.originalType, value, column);
390390
}
391391
if (column.primitiveType !== 'BYTE_ARRAY') {
392392
value = encodeValues(column.primitiveType!,'PLAIN',[value],column);

0 commit comments

Comments (0)