Skip to content

Commit 19f3ffa

Browse files
authored
Decimal Writer Support (#90)
Problem
=======
Need to support writing decimal types

Also closes: #87

Solution
========
Add basic encoding support for decimals

Change summary:
---------------
* Added better decimal field errors
* Default scale to 0 per spec
* Clean up types on RLE so it only asks for what is needed
* Support decimal encoding
* Add test for write / read of decimal

Steps to Verify:
----------------
1. Generate a schema with decimal field
2. Use it!
1 parent fa1865b commit 19f3ffa

10 files changed

+174
-44
lines changed

lib/codec/plain.ts

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,16 @@ function decodeValues_BOOLEAN(cursor: Cursor, count: number) {
2626
return values;
2727
}
2828

29-
function encodeValues_INT32(values: Array<number>) {
29+
function encodeValues_INT32(values: Array<number>, opts: Options) {
30+
const isDecimal = opts?.originalType === 'DECIMAL' || opts?.column?.originalType === 'DECIMAL';
31+
const scale = opts?.scale || 0;
3032
let buf = Buffer.alloc(4 * values.length);
3133
for (let i = 0; i < values.length; i++) {
32-
buf.writeInt32LE(values[i], i * 4);
34+
if (isDecimal) {
35+
buf.writeInt32LE(values[i] * Math.pow(10, scale), i * 4);
36+
} else {
37+
buf.writeInt32LE(values[i], i * 4);
38+
}
3339
}
3440

3541
return buf;
@@ -55,10 +61,16 @@ function decodeValues_INT32(cursor: Cursor, count: number, opts: Options) {
5561
return values;
5662
}
5763

58-
function encodeValues_INT64(values: Array<number>) {
64+
function encodeValues_INT64(values: Array<number>, opts: Options) {
65+
const isDecimal = opts?.originalType === 'DECIMAL' || opts?.column?.originalType === 'DECIMAL';
66+
const scale = opts?.scale || 0;
5967
let buf = Buffer.alloc(8 * values.length);
6068
for (let i = 0; i < values.length; i++) {
61-
buf.writeBigInt64LE(BigInt(values[i]), i * 8);
69+
if (isDecimal) {
70+
buf.writeBigInt64LE(BigInt(Math.floor(values[i] * Math.pow(10, scale))), i * 8);
71+
} else {
72+
buf.writeBigInt64LE(BigInt(values[i]), i * 8);
73+
}
6274
}
6375

6476
return buf;
@@ -86,15 +98,11 @@ function decodeValues_INT64(cursor: Cursor, count: number, opts: Options) {
8698
}
8799

88100
function decodeValues_DECIMAL(cursor: Cursor, count: number, opts: Options) {
89-
let {
90-
scale,
91-
precision
92-
} = opts;
101+
const precision = opts.precision;
102+
// Default scale to 0 per spec
103+
const scale = opts.scale || 0;
93104

94105
const name = opts.name || undefined
95-
if (!scale) {
96-
throw `missing option: scale (required for DECIMAL) for column: ${name}`;
97-
}
98106
if (!precision) {
99107
throw `missing option: precision (required for DECIMAL) for column: ${name}`;
100108
}
@@ -283,10 +291,10 @@ export const encodeValues = function (
283291
return encodeValues_BOOLEAN(values as Array<boolean>);
284292

285293
case "INT32":
286-
return encodeValues_INT32(values as Array<number>);
294+
return encodeValues_INT32(values as Array<number>, opts);
287295

288296
case "INT64":
289-
return encodeValues_INT64(values as Array<number>);
297+
return encodeValues_INT64(values as Array<number>, opts);
290298

291299
case "INT96":
292300
return encodeValues_INT96(values as Array<number>);

lib/codec/plain_dictionary.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import * as rle from './rle'
22
import { Cursor, Options } from './types'
33

44
export const decodeValues = function(type: string, cursor: Cursor, count: number, opts: Options) {
5-
opts.bitWidth = cursor.buffer.slice(cursor.offset, cursor.offset+1).readInt8(0);
5+
const bitWidth = cursor.buffer.slice(cursor.offset, cursor.offset+1).readInt8(0);
66
cursor.offset += 1;
7-
return rle.decodeValues(type, cursor, count, Object.assign({}, opts, {disableEnvelope: true}));
7+
return rle.decodeValues(type, cursor, count, Object.assign({}, opts, { disableEnvelope: true, bitWidth }));
88
};

lib/codec/rle.ts

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33
// https://github.com/apache/parquet-format/blob/master/Encodings.md
44

55
import varint from 'varint'
6-
import {Cursor, Options} from './types'
6+
import { Cursor } from './types'
77

8-
function encodeRunBitpacked(values: Array<number>, opts: Options) {
8+
function encodeRunBitpacked(values: Array<number>, opts: { bitWidth: number }) {
99
for (let i = 0; i < values.length % 8; i++) {
1010
values.push(0);
1111
}
@@ -23,7 +23,7 @@ function encodeRunBitpacked(values: Array<number>, opts: Options) {
2323
]);
2424
}
2525

26-
function encodeRunRepeated(value: number, count: number, opts: Options) {
26+
function encodeRunRepeated(value: number, count: number, opts: { bitWidth: number }) {
2727
let buf = Buffer.alloc(Math.ceil(opts.bitWidth / 8));
2828
let remainingValue = value
2929

@@ -48,7 +48,7 @@ function unknownToParsedInt(value: string | number) {
4848
}
4949
}
5050

51-
export const encodeValues = function(type: string, values: Array<number>, opts: Options) {
51+
export const encodeValues = function(type: string, values: Array<number>, opts: { bitWidth: number, disableEnvelope?: boolean }) {
5252
if (!('bitWidth' in opts)) {
5353
throw 'bitWidth is required';
5454
}
@@ -108,7 +108,7 @@ export const encodeValues = function(type: string, values: Array<number>, opts:
108108
return envelope;
109109
};
110110

111-
function decodeRunBitpacked(cursor : Cursor, count: number, opts: Options) {
111+
function decodeRunBitpacked(cursor : Cursor, count: number, opts: { bitWidth: number }) {
112112
if (count % 8 !== 0) {
113113
throw 'must be a multiple of 8';
114114
}
@@ -124,7 +124,7 @@ function decodeRunBitpacked(cursor : Cursor, count: number, opts: Options) {
124124
return values;
125125
}
126126

127-
function decodeRunRepeated(cursor: Cursor, count: number, opts: Options) {
127+
function decodeRunRepeated(cursor: Cursor, count: number, opts: { bitWidth: number }) {
128128
var bytesNeededForFixedBitWidth = Math.ceil(opts.bitWidth / 8);
129129
let value = 0;
130130

@@ -139,7 +139,7 @@ function decodeRunRepeated(cursor: Cursor, count: number, opts: Options) {
139139
return new Array(count).fill(value);
140140
}
141141

142-
export const decodeValues = function(_: string, cursor: Cursor, count: number, opts: Options) {
142+
export const decodeValues = function(_: string, cursor: Cursor, count: number, opts: { bitWidth: number, disableEnvelope?: boolean }) {
143143
if (!('bitWidth' in opts)) {
144144
throw 'bitWidth is required';
145145
}

lib/codec/types.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import { Statistics } from "../../gen-nodejs/parquet_types";
55
export interface Options {
66
typeLength: number,
77
bitWidth: number,
8-
disableEnvelope: boolean
8+
disableEnvelope?: boolean
99
primitiveType?: PrimitiveType;
1010
originalType?: OriginalType;
1111
encoding?: ParquetCodec;

lib/reader.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ const {
2222
const PARQUET_MAGIC = 'PAR1';
2323

2424
/**
25-
* Parquet File Format Version
25+
* Supported Parquet File Format Version for reading
2626
*/
27-
const PARQUET_VERSION = 1;
27+
const PARQUET_VERSIONS = [1, 2];
2828

2929
/**
3030
* Internal type used for repetition/definition levels
@@ -166,7 +166,7 @@ export class ParquetReader {
166166
*/
167167
constructor(metadata: FileMetaDataExt, envelopeReader: ParquetEnvelopeReader, opts?: BufferReaderOptions) {
168168
opts = opts || {};
169-
if (metadata.version != PARQUET_VERSION) {
169+
if (!PARQUET_VERSIONS.includes(metadata.version)) {
170170
throw 'invalid parquet version';
171171
}
172172

@@ -1021,8 +1021,8 @@ async function decodeDataPageV2(cursor: Cursor, header: parquet_thrift.PageHeade
10211021
valuesBufCursor,
10221022
valueCountNonNull,
10231023
{
1024-
typeLength: opts.column!.typeLength!,
1025-
bitWidth: opts.column!.typeLength!
1024+
bitWidth: opts.column!.typeLength!,
1025+
...opts.column!
10261026
});
10271027

10281028
return {

lib/schema.ts

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,8 @@ function buildFields(schema: SchemaDefinition, rLevelParentMax?: number, dLevelP
170170
}
171171

172172
if (typeDef.originalType === 'DECIMAL') {
173+
// Default scale to 0 per https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal
174+
if (typeof opts.scale === "undefined") opts.scale = 0;
173175
fieldErrors = fieldErrors.concat(errorsForDecimalOpts(typeDef.originalType, opts, nameWithPath));
174176
}
175177

@@ -219,19 +221,35 @@ function isDefined<T>(val: T | undefined): val is T {
219221

220222
function errorsForDecimalOpts(type: string, opts: FieldDefinition, columnName: string): string[] {
221223
const fieldErrors = []
222-
if(!opts.precision) {
224+
if(opts.precision === undefined || opts.precision < 1) {
223225
fieldErrors.push(
224-
`invalid schema for type: ${type}, for Column: ${columnName}, precision is required`
226+
`invalid schema for type: ${type}, for Column: ${columnName}, precision is required and must be be greater than 0`
227+
);
228+
}
229+
else if (!Number.isInteger(opts.precision)) {
230+
fieldErrors.push(
231+
`invalid schema for type: ${type}, for Column: ${columnName}, precision must be an integer`
225232
);
226233
}
227234
else if (opts.precision > 18) {
228235
fieldErrors.push(
229-
`invalid precision for type: ${type}, for Column: ${columnName}, can not handle precision over 18`
236+
`invalid schema for type: ${type}, for Column: ${columnName}, can not handle precision over 18`
237+
);
238+
}
239+
if (typeof opts.scale === "undefined" || opts.scale < 0) {
240+
fieldErrors.push(
241+
`invalid schema for type: ${type}, for Column: ${columnName}, scale is required to be 0 or greater`
242+
);
243+
}
244+
else if (!Number.isInteger(opts.scale)) {
245+
fieldErrors.push(
246+
`invalid schema for type: ${type}, for Column: ${columnName}, scale must be an integer`
230247
);
231248
}
232-
if (!opts.scale) {
249+
// Default precision to 18 if it is undefined as that is a different error
250+
else if (opts.scale > (opts.precision || 18)) {
233251
fieldErrors.push(
234-
`invalid schema for type: ${type}, for Column: ${columnName}, scale is required`
252+
`invalid schema or precision for type: ${type}, for Column: ${columnName}, precision must be greater than or equal to scale`
235253
);
236254
}
237255
return fieldErrors

lib/writer.ts

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import stream from 'stream'
2-
import parquet_thrift from '../gen-nodejs/parquet_types'
2+
import parquet_thrift, { ConvertedType } from '../gen-nodejs/parquet_types'
33
import * as parquet_shredder from './shred'
44
import * as parquet_util from './util'
55
import * as parquet_codec from './codec'
@@ -487,8 +487,8 @@ async function encodeDataPage(column: ParquetField, values: number[], rlevels: n
487487
column.primitiveType!,
488488
column.encoding!,
489489
values, {
490-
typeLength: column.typeLength,
491-
bitWidth: column.typeLength
490+
bitWidth: column.typeLength,
491+
...column
492492
});
493493

494494
/* encode repetition and definition levels */
@@ -545,8 +545,8 @@ async function encodeDataPageV2(column: ParquetField, rowCount: number, values:
545545
column.primitiveType!,
546546
column.encoding!,
547547
values, {
548-
typeLength: column.typeLength,
549-
bitWidth: column.typeLength
548+
bitWidth: column.typeLength,
549+
...column,
550550
});
551551

552552
let valuesBufCompressed = await parquet_compression.deflate(
@@ -772,6 +772,14 @@ function encodeFooter(schema: ParquetSchema, rowCount: Int64, rowGroups: RowGrou
772772
schemaElem.converted_type = parquet_thrift.ConvertedType[field.originalType];
773773
}
774774

775+
// Support Decimal
776+
switch(schemaElem.converted_type) {
777+
case (ConvertedType.DECIMAL):
778+
schemaElem.precision = field.precision;
779+
schemaElem.scale = field.scale || 0;
780+
break;
781+
}
782+
775783
schemaElem.type_length = field.typeLength;
776784

777785
metadata.schema.push(schemaElem);

package-lock.json

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

test/integration.js

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -576,5 +576,61 @@ describe('Parquet', function() {
576576
);
577577
});
578578
});
579+
580+
describe('Decimal schema', function() {
581+
const schema = new parquet.ParquetSchema({
582+
zero_column: { type: 'DECIMAL', precision: 10, scale: 0 },
583+
no_scale_column: { type: 'DECIMAL', precision: 10 },
584+
scale_64_column: { type: 'DECIMAL', precision: 10, scale: 2 },
585+
scale_32_column: { type: 'DECIMAL', precision: 8, scale: 2 },
586+
});
587+
588+
const rowData = {
589+
zero_column: 1,
590+
no_scale_column: 2,
591+
scale_64_column: 3.345678901234567,
592+
scale_32_column: 3.3,
593+
};
594+
595+
it('write a test file with decimals in v1 data page and read it back', async function() {
596+
const file = "decimal-test-v1.parquet";
597+
const opts = { useDataPageV2: false };
598+
const writer = await parquet.ParquetWriter.openFile(schema, file, opts);
599+
600+
await writer.appendRow(rowData);
601+
await writer.close();
602+
603+
const reader = await parquet.ParquetReader.openFile(file);
604+
605+
const cursor = reader.getCursor();
606+
const row = await cursor.next();
607+
assert.deepEqual(row, {
608+
zero_column: 1,
609+
no_scale_column: 2,
610+
scale_64_column: 3.34, // Scale 2
611+
scale_32_column: 3.3,
612+
})
613+
});
614+
615+
it('write a test file with decimals in v2 data page and read it back', async function() {
616+
const file = "decimal-test-v2.parquet";
617+
const opts = { useDataPageV2: true };
618+
const writer = await parquet.ParquetWriter.openFile(schema, file, opts);
619+
620+
await writer.appendRow(rowData);
621+
await writer.close();
622+
623+
const reader = await parquet.ParquetReader.openFile(file);
624+
625+
const cursor = reader.getCursor();
626+
const row = await cursor.next();
627+
assert.deepEqual(row, {
628+
zero_column: 1,
629+
no_scale_column: 2,
630+
scale_64_column: 3.34, // Scale 2
631+
scale_32_column: 3.3,
632+
})
633+
});
634+
});
579635
});
580636

0 commit comments

Comments
 (0)