@0x77dev
Created April 4, 2024 21:40
Split CSV files in Bun.js into 100 MB chunks, repeating the header in every chunk and keeping rows intact, streaming the input for memory efficiency.
#!/usr/bin/env bun
import { createReadStream, createWriteStream } from 'fs'
import { stat } from 'fs/promises'
import { basename } from 'path'
import { Transform } from 'stream'

// Maximum size of each output part (100 MB).
const MAX_SIZE = 100 * 1024 * 1024
// Split one CSV file into parts of at most MAX_SIZE bytes, repeating the
// header at the top of every part and never breaking a row across parts.
const splitFile = async (filePath: string): Promise<void> => {
  const fileStream = createReadStream(filePath, 'utf-8')
  let header = ''
  let leftover = '' // partial line carried over between chunks
  let partIndex = 1
  let partSize = 0
  let partStream = createWriteStream(getPartFileName(filePath, partIndex))

  // Close the current part and start the next one, re-writing the header.
  const rotatePart = () => {
    partStream.end()
    partIndex++
    partStream = createWriteStream(getPartFileName(filePath, partIndex))
    partStream.write(header)
    partSize = Buffer.byteLength(header, 'utf-8')
  }

  const writeLine = (line: string) => {
    const lineSize = Buffer.byteLength(line, 'utf-8') + 1 // +1 for the newline
    if (partSize + lineSize > MAX_SIZE) {
      rotatePart()
    }
    partStream.write(line + '\n')
    partSize += lineSize
  }

  const splitStream = new Transform({
    transform(chunk, _encoding, callback) {
      // Prepend the partial line from the previous chunk so rows that span
      // chunk boundaries are never cut in half.
      const lines = (leftover + chunk.toString()).split('\n')
      // The last element may be an incomplete line; keep it for the next chunk.
      leftover = lines.pop() ?? ''

      if (!header && lines.length > 0) {
        header = lines.shift()! + '\n'
        partStream.write(header)
        partSize += Buffer.byteLength(header, 'utf-8')
      }

      for (const line of lines) {
        writeLine(line)
      }
      callback()
    },
    flush(callback) {
      // Write any trailing row that did not end with a newline.
      if (leftover) {
        writeLine(leftover)
      }
      partStream.end()
      callback()
    }
  })

  await new Promise<void>((resolve, reject) => {
    fileStream
      .on('error', reject) // propagate read errors
      .pipe(splitStream)
      .on('finish', resolve)
      .on('error', reject) // propagate transform/write errors
  })
}
// Build the output name for a part, e.g. data.csv -> data_part1.csv.
// Parts are written to the current working directory.
const getPartFileName = (filePath: string, partIndex: number): string => {
  const fileName = basename(filePath, '.csv')
  return `${fileName}_part${partIndex}.csv`
}
// Treat every CLI argument as a CSV file to split; files are processed concurrently.
const filePaths = process.argv.slice(2)

await Promise.all(
  filePaths.map(async filePath => {
    try {
      const stats = await stat(filePath)
      if (stats.isFile()) {
        console.log(`Splitting file: ${filePath}`)
        await splitFile(filePath)
        console.log(filePath, 'done!')
      }
    } catch (err) {
      console.error(`Error processing file: ${filePath}`, err)
    }
  })
)
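Usage: assuming the script is saved as, say, split-csv.ts, run "bun split-csv.ts big.csv other.csv". Each input yields big_part1.csv, big_part2.csv, and so on in the current working directory, with every part starting with the original header row.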