diff --git a/package.json b/package.json index 32dd329..f10e249 100644 --- a/package.json +++ b/package.json @@ -26,15 +26,14 @@ "dependencies": { "@types/node": "^18.11.18", "@types/node-fetch": "^2.6.4", - "@types/parquetjs": "^0.10.6", "@types/progress-stream": "^2.0.5", "abort-controller": "^3.0.0", "agentkeepalive": "^4.2.1", "axios": "^1.7.7", "form-data-encoder": "1.7.2", "formdata-node": "^4.3.2", + "hyparquet": "^1.6.3", "node-fetch": "^2.6.7", - "parquetjs": "^0.11.2", "progress-stream": "^2.0.0" }, "devDependencies": { diff --git a/src/lib/upload.ts b/src/lib/upload.ts index 0978f02..7b7f718 100644 --- a/src/lib/upload.ts +++ b/src/lib/upload.ts @@ -7,8 +7,7 @@ import fetch from 'node-fetch'; import * as path from 'path'; import progress from 'progress-stream'; import readline from 'readline'; -import pkg from 'parquetjs'; -const { ParquetReader } = pkg; +import { asyncBufferFromFile, parquetMetadataAsync, parquetSchema, SchemaTree } from 'hyparquet'; export interface FileResponse { id: string; @@ -76,16 +75,16 @@ export async function check_file(fileName: string): Promise { export async function check_parquet(fileName: string): Promise { try { - const reader = await ParquetReader.openFile(fileName); - const cursor = reader.getCursor(); - let record = null; + const asyncBuffer = await asyncBufferFromFile(fileName); + const metadata = await parquetMetadataAsync(asyncBuffer); + const { children } = parquetSchema(metadata); - const fieldNames = Object.keys(reader.schema.fields); - if (!('input_ids' in fieldNames)) { + const fieldNames = children.map((child: SchemaTree) => child.element.name); + if (!fieldNames.includes('input_ids')) { return `Parquet file ${fileName} does not contain the 'input_ids' column.`; } - for (const fieldName in fieldNames) { + for (const fieldName of fieldNames) { if (!PARQUET_EXPECTED_COLUMNS.includes(fieldName)) { return `Parquet file ${fileName} contains unexpected column ${fieldName}. Only ${PARQUET_EXPECTED_COLUMNS.join( ', ', @@ -93,12 +92,10 @@ export async function check_parquet(fileName: string): Promise= 2.2.3" - through2@~2.0.3: version "2.0.5" resolved "https://registry.yarnpkg.com/through2/-/through2-2.0.5.tgz#01c1e39eb31d07cb7d03a96a70823260b23132cd" @@ -3550,11 +3468,6 @@ v8-to-istanbul@^9.0.1: "@types/istanbul-lib-coverage" "^2.0.1" convert-source-map "^2.0.0" -varint@^5.0.0: - version "5.0.2" - resolved "https://registry.yarnpkg.com/varint/-/varint-5.0.2.tgz#5b47f8a947eb668b848e034dcfa87d0ff8a7f7a4" - integrity sha512-lKxKYG6H03yCZUpAGOPOsMcGxd1RHCu1iKvEHYDPmTyq2HueGhD73ssNBqqQWfvYs04G9iUFRvmAVLW20Jw6ow== - walker@^1.0.8: version "1.0.8" resolved "https://registry.yarnpkg.com/walker/-/walker-1.0.8.tgz#bd498db477afe573dc04185f011d3ab8a8d7653f" @@ -3609,11 +3522,6 @@ write-file-atomic@^4.0.2: imurmurhash "^0.1.4" signal-exit "^3.0.7" -"ws@>= 2.2.3": - version "8.18.0" - resolved "https://registry.yarnpkg.com/ws/-/ws-8.18.0.tgz#0d7505a6eafe2b0e712d232b42279f53bc289bbc" - integrity sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw== - xtend@~4.0.1: version "4.0.2" resolved "https://registry.yarnpkg.com/xtend/-/xtend-4.0.2.tgz#bb72779f5fa465186b1f438f674fa347fdb5db54"