Skip to content

Commit b5698e4

Browse files
Feature: Parquet Schema from JSON Schema (#82) with
Problem ======= We want it to be easier to build Parquet files (and thus the schema for those files) from existing schemas. In this case JSON Schemas Solution ======== Change summary: --------------- * Added field creation helper functions * Added static function `ParquetSchema.fromJsonSchema` that takes in a Json Schema Steps to Verify: ---------------- 1. Examples are in the tests, but take a JSON schema and call `ParquetSchema.fromJsonSchema` with it and see what it does --------- Co-authored-by: Shannon Wells <[email protected]> With assistance from @noxify
1 parent 2c733b5 commit b5698e4

20 files changed

+4943
-516
lines changed

README.md

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,8 @@ Parquet files have a strict schema, similar to tables in a SQL database. So,
8787
in order to produce a Parquet file we first need to declare a new schema. Here
8888
is a simple example that shows how to instantiate a `ParquetSchema` object:
8989

90+
### Native Schema Definition
91+
9092
``` js
9193
// declare a schema for the `fruits` table
9294
var schema = new parquet.ParquetSchema({
@@ -98,6 +100,46 @@ var schema = new parquet.ParquetSchema({
98100
});
99101
```
100102

103+
### Helper Functions
104+
105+
```js
106+
var schema = new parquet.ParquetSchema({
107+
name: parquet.ParquetFieldBuilder.createStringField(),
108+
quantity: parquet.ParquetFieldBuilder.createIntField(64),
109+
price: parquet.ParquetFieldBuilder.createDoubleField(),
110+
date: parquet.ParquetFieldBuilder.createTimestampField(),
111+
in_stock: parquet.ParquetFieldBuilder.createBooleanField()
112+
});
113+
```
114+
115+
### JSON Schema
116+
117+
``` js
118+
// declare a Parquet schema for the `fruits` table from a JSON Schema
119+
var schema = parquet.ParquetSchema.fromJsonSchema({
120+
"type": "object",
121+
"properties": {
122+
"name": {
123+
"type": "string"
124+
},
125+
"quantity": {
126+
"type": "integer"
127+
},
128+
"price": {
129+
"type": "number"
130+
},
131+
"date": {
132+
"type": "string"
133+
},
134+
"in_stock": {
135+
"type": "boolean"
136+
}
137+
},
138+
"required": ["name", "quantity", "price", "date", "in_stock"]
139+
});
140+
```
141+
142+
101143
Note that the Parquet schema supports nesting, so you can store complex, arbitrarily
102144
nested records into a single row (more on that later) while still maintaining good
103145
compression.

lib/fields.ts

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
// Helper functions for creating fields
2+
3+
import { FieldDefinition, ParquetType, SchemaDefinition } from "./declare";
4+
5+
/**
 * Builds a field definition for a `UTF8` (string) column.
 *
 * @param optional - value stored in the result's `optional` flag; defaults to `true`
 * @param fieldOptions - extra settings copied into the result; any `optional`
 *   or `type` entries it carries are overridden
 * @returns a new field definition with `type: 'UTF8'`
 */
export function createStringField(optional = true, fieldOptions: FieldDefinition = {}): FieldDefinition {
  const definition: FieldDefinition = { ...fieldOptions, optional, type: 'UTF8' };
  return definition;
}
8+
9+
/**
 * Builds a field definition for a `BOOLEAN` column.
 *
 * @param optional - value stored in the result's `optional` flag; defaults to `true`
 * @param fieldOptions - extra settings copied into the result; any `optional`
 *   or `type` entries it carries are overridden
 * @returns a new field definition with `type: 'BOOLEAN'`
 */
export function createBooleanField(optional = true, fieldOptions: FieldDefinition = {}): FieldDefinition {
  const definition: FieldDefinition = { ...fieldOptions, optional, type: 'BOOLEAN' };
  return definition;
}
12+
13+
/**
 * Builds a field definition for an integer column of the given bit width.
 *
 * @param size - bit width; the resulting type is `INT` followed by this number
 * @param optional - value stored in the result's `optional` flag; defaults to `true`
 * @param fieldOptions - extra settings copied into the result; any `optional`
 *   or `type` entries it carries are overridden
 * @returns a new field definition with `type: 'INT32'` or `type: 'INT64'`
 */
export function createIntField(size: 32 | 64, optional = true, fieldOptions: FieldDefinition = {}): FieldDefinition {
  const definition: FieldDefinition = { ...fieldOptions, optional, type: `INT${size}` };
  return definition;
}
16+
17+
/**
 * Builds a field definition for a `FLOAT` column.
 *
 * @param optional - value stored in the result's `optional` flag; defaults to `true`
 * @param fieldOptions - extra settings copied into the result; any `optional`
 *   or `type` entries it carries are overridden
 * @returns a new field definition with `type: 'FLOAT'`
 */
export function createFloatField(optional = true, fieldOptions: FieldDefinition = {}): FieldDefinition {
  const definition: FieldDefinition = { ...fieldOptions, optional, type: 'FLOAT' };
  return definition;
}
20+
21+
/**
 * Builds a field definition for a `DOUBLE` column.
 *
 * @param optional - value stored in the result's `optional` flag; defaults to `true`
 * @param fieldOptions - extra settings copied into the result; any `optional`
 *   or `type` entries it carries are overridden
 * @returns a new field definition with `type: 'DOUBLE'`
 */
export function createDoubleField(optional = true, fieldOptions: FieldDefinition = {}): FieldDefinition {
  const definition: FieldDefinition = { ...fieldOptions, optional, type: 'DOUBLE' };
  return definition;
}
24+
25+
/**
 * Builds a field definition that carries a `precision` value.
 *
 * NOTE(review): the emitted `type` is `'FLOAT'`, not a decimal type — confirm
 * this is intentional before relying on it for exact decimal values.
 *
 * @param precision - copied verbatim into the result's `precision` entry
 * @param optional - value stored in the result's `optional` flag; defaults to `true`
 * @param fieldOptions - extra settings copied into the result; any `precision`,
 *   `optional` or `type` entries it carries are overridden
 * @returns a new field definition with `type: 'FLOAT'`
 */
export function createDecimalField(precision: number, optional = true, fieldOptions: FieldDefinition = {}): FieldDefinition {
  const definition: FieldDefinition = { ...fieldOptions, precision, optional, type: 'FLOAT' };
  return definition;
}
28+
29+
/**
 * Builds a field definition for a `TIMESTAMP_MILLIS` column.
 *
 * @param optional - value stored in the result's `optional` flag; defaults to `true`
 * @param fieldOptions - extra settings copied into the result; any `optional`
 *   or `type` entries it carries are overridden
 * @returns a new field definition with `type: 'TIMESTAMP_MILLIS'`
 */
export function createTimestampField(optional = true, fieldOptions: FieldDefinition = {}): FieldDefinition {
  const definition: FieldDefinition = { ...fieldOptions, optional, type: 'TIMESTAMP_MILLIS' };
  return definition;
}
32+
33+
/**
 * Builds a field definition for a nested group (struct) of fields.
 *
 * @param fields - schema definition of the nested fields
 * @param optional - value stored in the result's `optional` flag; defaults to `true`
 * @returns a new field definition holding only `optional` and `fields` (no `type`)
 */
export function createStructField(fields: SchemaDefinition, optional = true): FieldDefinition {
  const struct: FieldDefinition = { optional, fields };
  return struct;
}
39+
40+
/**
 * Builds a `LIST` field definition whose elements are nested groups (structs).
 *
 * The result has the shape `{ type: 'LIST', fields: { list: { repeated,
 * fields: { element: { fields } } } } }`.
 *
 * @param fields - schema definition of each list element's nested fields
 * @param optional - value stored in the list field's `optional` flag; defaults to `true`
 * @returns a new `LIST` field definition wrapping the struct element
 */
export function createStructListField(fields: SchemaDefinition, optional = true): FieldDefinition {
  // Assemble the nesting from the innermost level outwards.
  const element: FieldDefinition = { fields };
  const list: FieldDefinition = { repeated: true, fields: { element } };
  return { type: 'LIST', optional, fields: { list } };
}
56+
57+
/**
 * Builds a `LIST` field definition whose elements have the given type.
 *
 * The result has the shape `{ type: 'LIST', fields: { list: { repeated,
 * fields: { element } } } }`.
 *
 * @param type - type assigned to each list element; always wins over any
 *   `type` entry in `elementOptions`
 * @param optional - value stored in the list field's `optional` flag; defaults to `true`
 * @param elementOptions - extra settings merged into the element definition;
 *   the element is `optional: true` unless this says otherwise
 * @returns a new `LIST` field definition wrapping the typed element
 */
export function createListField(type: ParquetType, optional = true, elementOptions: FieldDefinition = { optional: true }): FieldDefinition {
  // Assemble the nesting from the innermost level outwards.
  const element: FieldDefinition = { optional: true, ...elementOptions, type };
  const list: FieldDefinition = { repeated: true, fields: { element } };
  return { type: 'LIST', optional, fields: { list } };
}

lib/jsonSchema.ts

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
// Support json-schema.org schema conversion to a Parquet file
2+
import { JSONSchema4 } from 'json-schema';
3+
import { FieldDefinition, SchemaDefinition } from './declare';
4+
import * as fields from './fields';
5+
6+
/**
 * JSON Schema keywords that the Parquet conversion does not handle.
 * Kept as one list so that the `SupportedJSONSchema4` type and the runtime
 * check in `isJsonSchemaSupported` cannot drift apart.
 */
const UNSUPPORTED_JSON_SCHEMA_FIELDS = [
  '$ref',
  'multipleOf',
  'allOf',
  'anyOf',
  'oneOf',
  'not',
  'additionalItems',
  'enum',
  'extends',
] as const;

type SupportedJSONSchema4 = Omit<JSONSchema4, typeof UNSUPPORTED_JSON_SCHEMA_FIELDS[number]>;

/**
 * Simple check to make sure that a schema really is a `SupportedJSONSchema4`.
 * There is a lot of JSON Schema functionality we just don't support for now.
 *
 * @param js - the JSON Schema (or sub-schema) to inspect
 * @returns `true` when every unsupported keyword is either absent or set to
 *   `false`; `false` as soon as one of them carries any other value
 */
const isJsonSchemaSupported = (js: JSONSchema4): js is SupportedJSONSchema4 =>
  // Iterate the keyword *names*. The earlier `for...in` loop walked the array
  // indices ("0", "1", ...), so no keyword was ever inspected and every schema
  // was reported as supported.
  UNSUPPORTED_JSON_SCHEMA_FIELDS.every(
    (field) => js[field] === undefined || js[field] === false,
  );
31+
32+
/**
 * Raised for every JSON Schema construct the converter cannot translate.
 * The message is always prefixed with `Unsupported JSON schema: `.
 */
export class UnsupportedJsonSchemaError extends Error {
  /**
   * @param msg - detail appended after the fixed prefix
   */
  constructor(msg: string) {
    super(`Unsupported JSON schema: ${msg}`);
    this.name = 'UnsupportedJsonSchemaError';
  }
}
42+
43+
/**
 * JSON Schema declares `required` on the enclosing schema rather than on each
 * field, so this returns a lookup bound to that enclosing schema.
 *
 * @param jsonSchema - the schema whose `required` entry is consulted
 * @returns a function answering whether `field` is required: always `true`
 *   when `required` is `true`, always `false` when it is `false` or missing,
 *   otherwise whether the `required` list includes the field name
 */
const isJsonSchemaRequired = (jsonSchema: SupportedJSONSchema4) => (field: string): boolean => {
  const { required } = jsonSchema;

  if (required === true) {
    return true;
  }
  if (required === undefined || required === false) {
    return false;
  }

  return required.includes(field);
};
56+
57+
/**
 * Converts a JSON Schema `array` field into the matching Parquet `LIST`
 * field definition, based on the type of its `items` schema.
 *
 * Element type mapping: `string` -> `UTF8`, `integer` -> `INT64`,
 * `number` -> `DOUBLE`, `boolean` -> `BOOLEAN`, `object` -> a list of structs
 * built by converting the `items` schema itself.
 *
 * `number` maps to `DOUBLE` (it was previously `INT64`) because a JSON Schema
 * `number` may be fractional and cannot be stored in an integer column.
 *
 * @param fieldValue - JSON Schema of the array field
 * @param optionalFieldList - whether the resulting list field is optional
 * @returns the Parquet field definition for the list
 * @throws UnsupportedJsonSchemaError when `items` is missing, has no `type`,
 *   or has a type not listed above
 */
const fromJsonSchemaArray = (fieldValue: SupportedJSONSchema4, optionalFieldList: boolean): FieldDefinition => {
  if (!fieldValue.items || !fieldValue.items.type) {
    throw new UnsupportedJsonSchemaError("Array field with no values found.");
  }

  switch (fieldValue.items.type) {
    case 'string':
      return fields.createListField('UTF8', optionalFieldList);
    case 'integer':
      return fields.createListField('INT64', optionalFieldList);
    case 'number':
      return fields.createListField('DOUBLE', optionalFieldList);
    case 'boolean':
      return fields.createListField('BOOLEAN', optionalFieldList);
    case 'object':
      return fields.createStructListField(fromJsonSchema(fieldValue.items), optionalFieldList);
    default:
      throw new UnsupportedJsonSchemaError(`Array field type ${JSON.stringify(fieldValue.items)} is unsupported.`);
  }
};
79+
80+
/**
 * Converts one property of a JSON Schema into a Parquet field definition.
 *
 * Type mapping: `string` -> string field, `integer` -> 64-bit int field,
 * `number` -> double field, `boolean` -> boolean field, `array` -> list
 * field, `object` -> struct field built by converting the nested schema.
 *
 * `number` maps to a double field (it was previously a 64-bit int) because a
 * JSON Schema `number` may be fractional and cannot be stored in an integer
 * column.
 *
 * @param jsonSchema - the enclosing schema; its `required` entry decides
 *   whether each converted field is optional
 * @returns a converter taking the property name and its schema
 * @throws UnsupportedJsonSchemaError when the property uses an unsupported
 *   keyword or a type not listed above
 */
const fromJsonSchemaField = (jsonSchema: JSONSchema4) => (fieldName: string, fieldValue: JSONSchema4): FieldDefinition => {
  if (!isJsonSchemaSupported(fieldValue)) {
    throw new UnsupportedJsonSchemaError(`Field: ${fieldName} has an unsupported schema`);
  }
  // A field is optional unless the enclosing schema marks it as required.
  const optional = !isJsonSchemaRequired(jsonSchema)(fieldName);

  switch (fieldValue.type) {
    case 'string':
      return fields.createStringField(optional);
    case 'integer':
      return fields.createIntField(64, optional);
    case 'number':
      return fields.createDoubleField(optional);
    case 'boolean':
      return fields.createBooleanField(optional);
    case 'array':
      return fromJsonSchemaArray(fieldValue, optional);
    case 'object':
      return fields.createStructField(fromJsonSchema(fieldValue), optional);
    default:
      throw new UnsupportedJsonSchemaError(
        `Unable to convert "${fieldName}" with JSON Schema type "${fieldValue.type}" to a Parquet Schema.`,
      );
  }
};
107+
108+
/**
 * Turns a supported JSON Schema into a Parquet schema definition by
 * converting each entry of its `properties` in turn.
 *
 * @param jsonSchema - the JSON Schema to convert; a schema without
 *   `properties` yields an empty definition
 * @returns a schema definition keyed by property name
 * @throws UnsupportedJsonSchemaError when the schema, or any property in it,
 *   cannot be converted
 */
export const fromJsonSchema = (jsonSchema: JSONSchema4): SchemaDefinition => {
  if (!isJsonSchemaSupported(jsonSchema)) {
    throw new UnsupportedJsonSchemaError("Unsupported fields found");
  }

  const convertField = fromJsonSchemaField(jsonSchema);
  const properties = jsonSchema.properties || {};

  return Object.entries(properties).reduce<SchemaDefinition>((definition, [name, value]) => {
    definition[name] = convertField(name, value);
    return definition;
  }, {});
};

lib/schema.ts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,12 @@ import * as parquet_codec from './codec';
22
import * as parquet_compression from './compression'
33
import * as parquet_types from './types'
44
import { SchemaDefinition, ParquetField, RepetitionType, FieldDefinition } from './declare'
5+
import { JSONSchema4 } from 'json-schema'
6+
import { fromJsonSchema } from './jsonSchema';
57

68
const PARQUET_COLUMN_KEY_SEPARATOR = '.';
79

10+
811
/**
912
* A parquet file schema
1013
*/
@@ -13,6 +16,14 @@ export class ParquetSchema {
1316
fields: Record<string, ParquetField>
1417
fieldList: Array<ParquetField>
1518

19+
/**
 * Builds a ParquetSchema from a JSON Schema (json-schema.org) document by
 * converting it to a schema definition and passing that to the constructor.
 */
static fromJsonSchema(jsonSchema: JSONSchema4) {
  return new ParquetSchema(fromJsonSchema(jsonSchema));
}
26+
1627
/**
1728
* Create a new schema from a JSON schema definition
1829
*/

0 commit comments

Comments
 (0)