Skip to content

Commit 6797c99

Browse files
authored
Fix Brotli compression (#144)
Note: The reference Brotli file doesn't work because it is too large: it decompresses to an extremely large size (1 GB+).

Closes #140
Closes #125 (likely?)

A new Brotli sample file was generated using the Python script shown below.

## What Changed?

- Moved the esbuild work to mjs. This was required to get some of the builds working correctly
- Fixed a bug in the esbuild script where it tried to build the browser code in parallel with the test code, causing a race condition
- Split the compression.ts file into a browser version and a node version
- Swapped over to use `esbuild-plugin-wat`, which worked better than the copy-pasted one from esbuild
- Integrated brotli-wasm correctly for the browser, but used brotli natively in nodejs

## Testing!

- There is a nodejs test for the node version
- There is a browser test for it as well:
  1. `npm i`
  2. `npm run build:browser`
  3. `npx serve .`
  4. `open http://localhost:3000/test/browser/` in your preferred browser
- The example server also has it:
  - cd examples/server/
  - npm i
  - node app.js
  - cd ../../ && npm run serve
  - `open http://localhost:3000/` in your preferred browser

### Brotli Sample File generation script

```python
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Create a small sample DataFrame
data = {
    'id': [1, 2, 3, 4, 5],
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'age': [25, 30, 35, 40, 45]
}
df = pd.DataFrame(data)

# Convert DataFrame to PyArrow Table
table = pa.Table.from_pandas(df)

# Define output Parquet file path
output_file = "sample_brotli_compressed.parquet"

# Write to Parquet file with Brotli compression
pq.write_table(table, output_file, compression='BROTLI')

print(f"File {output_file} created successfully!")
```
1 parent 540e0e5 commit 6797c99

24 files changed

+903
-933
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,5 @@ dist
88
!test/reference-test/files/*.parquet
99
examples/server/package-lock.json
1010
test/browser/*.js
11+
main.js
12+
main.js.map

README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -484,6 +484,15 @@ var writer = await parquet.ParquetWriter.openFile(schema, 'fruits.parquet');
484484
writer.setRowGroupSize(8192);
485485
```
486486

487+
## Browser Tests
488+
489+
To run the browser tests (folder: `test/browser`) in a specific browser:
490+
491+
1. `npm i`
492+
2. `npm run build:browser`
493+
3. `npx serve .`
494+
4. Open `http://localhost:3000/test/browser/` in your preferred browser (the trailing `/` is required)
495+
487496
## Dependencies
488497

489498
Parquet uses [thrift](https://thrift.apache.org/) to encode the schema and other

esbuild-plugins.js

Lines changed: 0 additions & 73 deletions
This file was deleted.

esbuild-plugins.mjs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import path from 'node:path';
2+
/**
3+
* This plugin redirects imports of `./compression` to the browser-specific implementation in `browser/compression.ts`
4+
*/
5+
export const compressionBrowserPlugin = {
6+
name: 'compressionBrowser',
7+
setup(build) {
8+
build.onResolve({ filter: /^\.\/compression$/ }, (args) => {
9+
return { path: path.join(args.resolveDir, args.path.replace('compression', 'browser/compression.ts')) };
10+
});
11+
},
12+
};

esbuild-serve.js renamed to esbuild-serve.mjs

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,24 +3,27 @@
33
* It attaches the parquet.js exports to a "parquetjs" global variable.
44
* See the example server for how to use it.
55
*/
6-
const { compressionBrowserPlugin, wasmPlugin } = require('./esbuild-plugins');
6+
import { compressionBrowserPlugin } from './esbuild-plugins.mjs';
7+
import watPlugin from 'esbuild-plugin-wat';
8+
import esbuild from 'esbuild';
79
// esbuild has TypeScript support by default. It will use tsconfig.json
8-
require('esbuild')
10+
esbuild
911
.context({
1012
entryPoints: ['parquet.ts'],
1113
outfile: 'main.js',
1214
define: { 'process.env.NODE_DEBUG': 'false', 'process.env.NODE_ENV': '"production"', global: 'window' },
1315
platform: 'browser',
14-
plugins: [compressionBrowserPlugin, wasmPlugin],
16+
plugins: [compressionBrowserPlugin, watPlugin()],
1517
sourcemap: 'external',
1618
bundle: true,
19+
minify: false,
1720
globalName: 'parquetjs',
18-
inject: ['./esbuild-shims.js'],
21+
inject: ['./esbuild-shims.mjs'],
1922
})
2023
.then((context) => {
2124
context
2225
.serve({
23-
servedir: __dirname,
26+
servedir: './',
2427
})
2528
.then((server) => {
2629
console.log('serving parquetjs', server);

esbuild-shims.js

Lines changed: 0 additions & 2 deletions
This file was deleted.

esbuild-shims.mjs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
import { Buffer as buffer } from 'buffer/';
2+
export let Buffer = buffer;

esbuild.js renamed to esbuild.mjs

Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
const esbuild = require('esbuild');
2-
const path = require('path');
3-
const { compressionBrowserPlugin, wasmPlugin } = require('./esbuild-plugins');
1+
import esbuild from 'esbuild';
2+
import watPlugin from 'esbuild-plugin-wat';
3+
import { compressionBrowserPlugin } from './esbuild-plugins.mjs';
44
// esbuild has TypeScript support by default
55
const baseConfig = {
66
bundle: true,
@@ -10,11 +10,11 @@ const baseConfig = {
1010
'process.env.NODE_ENV': '"production"',
1111
global: 'window',
1212
},
13-
inject: ['./esbuild-shims.js'],
13+
inject: ['./esbuild-shims.mjs'],
1414
minify: true,
1515
mainFields: ['browser', 'module', 'main'],
1616
platform: 'browser', // default
17-
plugins: [compressionBrowserPlugin, wasmPlugin],
17+
plugins: [compressionBrowserPlugin, watPlugin()],
1818
target: 'es2020', // default
1919
};
2020
// configuration for generating test code in browser
@@ -26,39 +26,53 @@ const testConfig = {
2626
'process.env.NODE_ENV': '"production"',
2727
global: 'window',
2828
},
29-
inject: ['./esbuild-shims.js'],
29+
inject: ['./esbuild-shims.mjs'],
3030
minify: false,
3131
mainFields: ['browser', 'module', 'main'],
3232
platform: 'browser', // default
33-
plugins: [compressionBrowserPlugin, wasmPlugin],
33+
plugins: [compressionBrowserPlugin, watPlugin()],
3434
target: 'es2020', // default
3535
};
3636
const targets = [
3737
{
3838
...baseConfig,
3939
globalName: 'parquetjs',
40-
outdir: path.resolve(__dirname, 'dist', 'browser'),
40+
outdir: './dist/browser',
4141
},
4242
{
4343
...baseConfig,
4444
format: 'esm',
45-
outfile: path.resolve(__dirname, 'dist', 'browser', 'parquet.esm.js'),
45+
outfile: 'dist/browser/parquet.esm.js',
4646
},
4747
{
4848
...baseConfig,
4949
format: 'cjs',
50-
outfile: path.resolve(__dirname, 'dist', 'browser', 'parquet.cjs.js'),
50+
outfile: 'dist/browser/parquet.cjs.js',
5151
},
52-
// Browser test code below
52+
];
53+
54+
// Browser test code below is only in ESM
55+
const testTargets = [
5356
{
5457
...testConfig,
55-
outfile: path.resolve(__dirname, 'test', 'browser', 'main.js'),
58+
format: 'esm',
59+
mainFields: ['module', 'main'],
60+
outfile: 'test/browser/main.js',
5661
},
5762
];
63+
5864
Promise.all(targets.map(esbuild.build))
5965
.then((results) => {
6066
if (results.reduce((m, r) => m && !r.warnings.length, true)) {
61-
console.log('built with no errors or warnings');
67+
console.log('built dist targets with no errors or warnings');
68+
}
69+
})
70+
.then(() => {
71+
return Promise.all(testTargets.map(esbuild.build));
72+
})
73+
.then((results) => {
74+
if (results.reduce((m, r) => m && !r.warnings.length, true)) {
75+
console.log('built test targets with no errors or warnings');
6276
}
6377
})
6478
.catch((e) => {

eslint.config.mjs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@ export default tseslint.config(
1010
mochaPlugin.configs.flat.recommended,
1111
...tseslint.configs.strict,
1212
...tseslint.configs.stylistic,
13+
{
14+
ignores: ['dist/*', 'test/browser/main.js'],
15+
},
1316
{
1417
rules: {
1518
// TODO: Fix/ignore in tests and remove
Binary file not shown.
Binary file not shown.
Binary file not shown.

examples/server/views/parquetFiles.ejs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,31 +11,38 @@
1111
<script>
1212
const port = "<%= port %>"
1313
const readit = function (filename) {
14+
const output = document.getElementById('output');
1415
parquetjs.ParquetReader
1516
.openUrl(`http://localhost:${port}/files/${filename}`)
1617
.then(async (reader) => {
1718
let cursor = await reader.getCursor();
1819
1920
let record = null;
21+
output.innerHTML = "";
2022
while (record = await cursor.next()) {
23+
output.innerHTML += `\nRecord: ${JSON.stringify(record, (_, v) => typeof v === 'bigint' ? v.toString() : v, 2)}`;
2124
console.log("record", record);
2225
}
2326
return reader;
2427
})
2528
.then((reader) => reader.close())
2629
.catch((e) => {
30+
output.innerHTML += `\nERROR: ${e}`;
2731
console.log("error", e);
2832
})
2933
3034
}
3135
3236
const search = async function (filename, columnName, searchTerm) {
37+
const output = document.getElementById('output');
3338
const rdr = await parquetjs.ParquetReader.openUrl(`http://localhost:${port}/files/${filename}`)
3439
const bfs = await rdr.getBloomFiltersFor([columnName])
3540
if (!bfs[columnName]) return false
41+
output.innerHTML = "";
3642
Promise.all(bfs[columnName].map(async item => item.sbbf.check(searchTerm)))
3743
.then(values => {
3844
const res = values.some((el) => (el === true));
45+
output.innerHTML += `\nResponse "${searchTerm}": ${JSON.stringify(res, null, 2)}`;
3946
console.log("res: ", res)
4047
return res
4148
})
@@ -54,10 +61,16 @@
5461
<li><button onclick="readit('fruits-bloomfilter.parquet')" >with Bloom filter</button></li>
5562
<li><button onclick="readit('alltypes_dictionary.parquet')" >all types w/ dictionary</button></li>
5663
<li><button onclick="readit('fruits.parquet')" >With (unsupported) Brötli compression</button></li>
64+
<li><button onclick="readit('gzip-nation.impala.parquet')" >With gzip compression</button></li>
65+
<li><button onclick="readit('snappy-compressed.parquet')" >With snappy compression</button></li>
66+
<li><button onclick="readit('sample_brotli_compressed.parquet')" >With brotli compression</button></li>
5767
<li><button onclick="readit('list.parquet')" >With list elements</button></li>
5868
</ul>
5969
<h3>Search fruits-bloomfilter.parquet "name" column for</h3>
6070
<button onclick="search('fruits-bloomfilter.parquet', 'name', 'kiwi')">Search fruits-bloomfilter.parquet for "kiwi"</button>
6171
<button onclick="search('fruits-bloomfilter.parquet', 'name', 'xxx')">Search fruits-bloomfilter.parquet for "xxx"</button>
72+
73+
<h2>Output</h2>
74+
<pre id="output"></pre>
6275
</body>
6376
</html>

0 commit comments

Comments
 (0)