Created
April 4, 2024 21:40
-
-
Save 0x77dev/d45b1ed4cc0d906d10a14ab1c5c8e00b to your computer and use it in GitHub Desktop.
Split a CSV file with Bun into 100MB chunks, streaming the input for memory efficiency while preserving the header and row integrity.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bun | |
import { createReadStream, createWriteStream, stat } from 'fs' | |
import { basename } from 'path' | |
import { Transform } from 'stream' | |
import { promisify } from 'util' | |
/** Size limit, in bytes, applied to each generated part file (100 MiB). */
const MAX_SIZE = 100 * 2 ** 20
/**
 * Splits a CSV file into numbered part files, each at most `maxPartSize`
 * bytes, repeating the header row (the first line of the input) at the top of
 * every part.
 *
 * The input is streamed, so memory use is bounded by one read chunk plus the
 * longest row. Rows are delimited by `\n`; a row that straddles two read
 * chunks is reassembled before being written, and a final row without a
 * trailing newline is written with one appended. A single row larger than the
 * limit is written to a part of its own (which then exceeds the limit) rather
 * than being split. An empty input produces no part files.
 *
 * @param filePath - Path of the CSV file to split.
 * @param maxPartSize - Maximum size of each part in bytes; defaults to `MAX_SIZE`.
 * @returns A promise that resolves once every part file has been closed.
 * @throws Rejects with the underlying stream error if reading the input or
 *   writing a part fails.
 */
const splitFile = async (filePath: string, maxPartSize = MAX_SIZE): Promise<void> => {
  type PartStream = ReturnType<typeof createWriteStream>

  const fileStream = createReadStream(filePath, 'utf-8')

  // Yields the complete rows found in each read chunk. The text after the last
  // newline of a chunk is an incomplete row, so it is carried into the next
  // chunk instead of being emitted as a row of its own.
  const readRowBatches = async function* (): AsyncGenerator<string[]> {
    let carry = ''
    for await (const chunk of fileStream) {
      const pieces = (carry + String(chunk)).split('\n')
      carry = pieces.pop() ?? ''
      if (pieces.length > 0) yield pieces
    }
    // Input that does not end with a newline still has one last row.
    if (carry !== '') yield [carry]
  }

  // Writes `text` and waits until it has been flushed, which bounds buffering
  // and surfaces write errors as rejections.
  const writeTo = (stream: PartStream, text: string): Promise<void> =>
    new Promise<void>((resolve, reject) => {
      const onError = (err: Error): void => reject(err)
      stream.once('error', onError)
      stream.write(text, err => {
        if (err) {
          // Keep the 'error' listener attached: the stream emits 'error' after
          // this callback, and an unhandled 'error' event would crash the process.
          reject(err)
          return
        }
        stream.off('error', onError)
        resolve()
      })
    })

  // Ends a part and waits until its file descriptor has been closed.
  const closePart = (stream: PartStream): Promise<void> =>
    new Promise<void>((resolve, reject) => {
      stream.once('error', reject)
      stream.once('close', () => resolve())
      stream.end()
    })

  let header = ''
  let headerSize = 0
  let partIndex = 0
  let partSize = 0
  let partStream: PartStream | undefined

  try {
    for await (const rows of readRowBatches()) {
      // Rows are accumulated per read chunk so each chunk costs one write.
      let batch = ''
      for (const row of rows) {
        const line = row + '\n'
        const lineSize = Buffer.byteLength(line, 'utf-8')

        if (partStream === undefined) {
          // The very first row is the header; it opens the first part.
          header = line
          headerSize = lineSize
          partIndex = 1
          partStream = createWriteStream(getPartFileName(filePath, partIndex))
          batch = header
          partSize = headerSize
          continue
        }

        // Roll over only when the current part already holds a data row, so an
        // oversized row never leaves a header-only part behind.
        if (partSize + lineSize > maxPartSize && partSize > headerSize) {
          if (batch !== '') await writeTo(partStream, batch)
          await closePart(partStream)
          partIndex++
          partStream = createWriteStream(getPartFileName(filePath, partIndex))
          batch = header
          partSize = headerSize
        }

        batch += line
        partSize += lineSize
      }
      if (partStream !== undefined && batch !== '') await writeTo(partStream, batch)
    }
    if (partStream !== undefined) await closePart(partStream)
  } catch (err) {
    // Release both file handles before reporting the failure to the caller.
    fileStream.destroy()
    partStream?.destroy()
    throw err
  }
}
/**
 * Builds the output file name for one part of a split CSV file.
 *
 * Only the base name of `filePath` is used (directories are dropped) and a
 * trailing `.csv` extension, if present, is removed before the part suffix is
 * appended.
 *
 * @param filePath - Path of the CSV file being split.
 * @param partIndex - Number of the part, embedded in the name.
 * @returns A name of the form `<base>_part<partIndex>.csv`.
 */
const getPartFileName = (filePath: string, partIndex: number): string =>
  basename(filePath, '.csv') + '_part' + String(partIndex) + '.csv'
// Command-line entry point: every argument is treated as the path of a CSV
// file to split. Files are processed concurrently, and a failure on one file
// does not stop the others.
const filePaths = process.argv.slice(2)

// Promisified once up front instead of once per path.
const statAsync = promisify(stat)

await Promise.all(
  filePaths.map(async filePath => {
    try {
      const stats = await statAsync(filePath)
      if (!stats.isFile()) {
        // Directories and other non-regular files are skipped, but visibly.
        console.warn(`Skipping non-file path: ${filePath}`)
        return
      }
      console.log(`Splitting file: ${filePath}`)
      await splitFile(filePath)
      console.log(filePath, 'done!')
    } catch (err) {
      console.error(`Error processing file: ${filePath}`, err)
      // Report the failure through the exit status without aborting the
      // files that are still being processed.
      process.exitCode = 1
    }
  })
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment