Skip to content

Instantly share code, notes, and snippets.

@mhkeller
Last active March 26, 2024 18:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mhkeller/855ca5c0a6582e4ead7d36e6f8169fdd to your computer and use it in GitHub Desktop.
Save mhkeller/855ca5c0a6582e4ead7d36e6f8169fdd to your computer and use it in GitHub Desktop.
Convert a parquet file to bytes representing an arrow table
{
"name": "parquet-to-arrow",
"version": "1.0.0",
"description": "",
"scripts": {
},
"dependencies": {
"apache-arrow": "^15.0.2",
"arrow-js-ffi": "^0.4.1",
"parquet-wasm": "^0.6.0-beta.2"
}
}
/**
* Converts a parquet file on disk to an arrow file in memory
* Adapted from the README example in https://github.com/kylebarron/parquet-wasm?tab=readme-ov-file
*
* @param {string} filepath
* @returns {buffer}
*/
const arrow = require("apache-arrow");
const { parseRecordBatch } = require("arrow-js-ffi");
const { readFileSync } = require("fs");
const { readParquet, wasmMemory } = require("parquet-wasm");
module.exports = async function parquetToArrow(filepath) {
// A reference to the WebAssembly memory object.
const WASM_MEMORY = wasmMemory();
const buf = readFileSync(filepath);
const parquetUint8Array = new Uint8Array(Buffer.from(buf));
const wasmArrowTable = readParquet(parquetUint8Array).intoFFI();
const recordBatches = [];
for (let i = 0; i < wasmArrowTable.numBatches(); i++) {
// Note: Unless you know what you're doing, setting `true` below is recommended to _copy_
// table data from WebAssembly into JavaScript memory. This may become the default in the
// future.
const recordBatch = parseRecordBatch(
WASM_MEMORY.buffer,
wasmArrowTable.arrayAddr(i),
wasmArrowTable.schemaAddr(),
true
);
recordBatches.push(recordBatch);
}
const table = new arrow.Table(recordBatches);
// Skip this step converting it to bytes if you just want the table
const ipcStream = arrow.tableToIPC(table, 'stream');
const bytes = Buffer.from(ipcStream, 'utf-8');
// VERY IMPORTANT! You must call `drop` on the Wasm table object when you're done using it
// to release the Wasm memory.
// Note that any access to the pointers in this table is undefined behavior after this call.
// Calling any `wasmArrowTable` method will error.
wasmArrowTable.drop();
return bytes;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment