Skip to content

Instantly share code, notes, and snippets.

@c4pt0r
Created April 13, 2023 00:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save c4pt0r/74c708ec7831ac349822e528a4a6efa9 to your computer and use it in GitHub Desktop.
Save c4pt0r/74c708ec7831ac349822e528a4a6efa9 to your computer and use it in GitHub Desktop.
Small tool to split a large CSV file
package main
import (
"bufio"
"bytes"
"compress/gzip"
"encoding/base64"
"encoding/csv"
"flag"
"fmt"
"io"
"os"
"path"
"github.com/c4pt0r/log"
)
// Command-line options, listed by flag name. Each variable holds the pointer
// returned by the flag package, so it carries the default value until
// flag.Parse has been called.
var (
	// hasHeader (-has-header) tells whether the input starts with a header row.
	hasHeader = flag.Bool("has-header", true, "CSV file has header row")

	// csvFile (-i) is the path of the CSV file to split; empty by default.
	csvFile = flag.String("i", "", "CSV file to split")

	// outDir (-o) is the directory that receives the partial CSV files.
	outDir = flag.String("o", "./output", "Output directory")

	// sizeLimit (-s) is the size limit, in bytes, of each partial CSV file.
	sizeLimit = flag.Int64(
		"s",
		10000000,
		"Size limit of each partial CSV file in bytes, default: 10000000 (10MB)",
	)
)
// mustCreateDir creates dir, along with any missing parent directories, using
// permission bits 0755. Every failure is reported through log.Fatalf.
//
// os.MkdirAll already returns nil when dir is an existing directory, so any
// error it returns is a real one. In particular an "already exists" error
// means that something other than a directory (for example a dangling
// symlink) occupies the path, so it must not be ignored.
func mustCreateDir(dir string) {
	if err := os.MkdirAll(dir, 0755); err != nil {
		log.Fatalf("Failed to create directory: %s, error: %v", dir, err)
	}
}
// splitCSVFile splits the CSV file originalCSVfile into a sequence of smaller
// CSV files written to outDir, and returns their paths in creation order.
//
// Output files are named "<base>_<index>.csv", where <base> is the last
// element of originalCSVfile and <index> counts up from 0. When withHeader is
// true the first row of the input is read as a header and repeated at the top
// of every output file. At least one output file is always created, even when
// the input holds no records.
//
// sizeLimit is the maximum size in bytes of each output file. It is measured
// on the encoded CSV, so delimiters, quotes, line terminators and the
// repeated header all count. Records are never split: a record that does not
// fit within sizeLimit on its own is written to a file by itself, and only
// that file exceeds the limit.
//
// On failure the error is returned with a nil slice; output files created up
// to that point are left on disk.
//
// NOTE: file names are built with the path package (not path/filepath), so
// only slash-separated paths are handled.
func splitCSVFile(originalCSVfile string, sizeLimit int64, outDir string, withHeader bool) ([]string, error) {
	in, err := os.Open(originalCSVfile)
	if err != nil {
		return nil, err
	}
	defer in.Close()

	r := csv.NewReader(bufio.NewReader(in))

	// Every row is encoded into a scratch buffer first, so that its exact
	// encoded size is known before deciding which file it belongs to. The
	// returned slice is only valid until the next call to encode.
	var scratch bytes.Buffer
	enc := csv.NewWriter(&scratch)
	encode := func(row []string) ([]byte, error) {
		scratch.Reset()
		if err := enc.Write(row); err != nil {
			return nil, err
		}
		enc.Flush()
		if err := enc.Error(); err != nil {
			return nil, err
		}
		return scratch.Bytes(), nil
	}

	// Read and pre-encode the header row once; it is replayed into each file.
	var headerBytes []byte
	if withHeader {
		header, err := r.Read()
		if err != nil {
			return nil, err
		}
		encoded, err := encode(header)
		if err != nil {
			return nil, err
		}
		// Copy, because the scratch buffer is reused for every record.
		headerBytes = append([]byte(nil), encoded...)
	}

	var (
		createdFiles []string
		out          *os.File
		w            *bufio.Writer
		fileIndex    int
		fileSize     int64 // bytes written to the current file, header included
		fileRecords  int64 // data records written to the current file
		rowCount     int64 // data records written overall
	)

	// closeCurrent flushes and closes the current output file, if any, and
	// reports the first error encountered while doing so.
	closeCurrent := func() error {
		if out == nil {
			return nil
		}
		flushErr := w.Flush()
		closeErr := out.Close()
		out, w = nil, nil
		if flushErr != nil {
			return flushErr
		}
		return closeErr
	}
	// Release the output file on every error path. On success the file has
	// already been closed explicitly and this is a no-op, so the result can
	// be safely discarded here.
	defer func() { _ = closeCurrent() }()

	// openNext finishes the current output file and starts the next one,
	// writing the header into it when there is one.
	openNext := func() (string, error) {
		if err := closeCurrent(); err != nil {
			return "", err
		}
		baseName := path.Base(originalCSVfile)
		filename := path.Join(outDir, fmt.Sprintf("%s_%d.csv", baseName, fileIndex))
		f, err := os.Create(filename)
		if err != nil {
			return "", err
		}
		out, w = f, bufio.NewWriter(f)
		fileIndex++
		createdFiles = append(createdFiles, filename)
		fileSize, fileRecords = 0, 0
		if withHeader {
			if _, err := w.Write(headerBytes); err != nil {
				return "", err
			}
			fileSize = int64(len(headerBytes))
		}
		return filename, nil
	}

	// The first output file is created up front.
	if _, err := openNext(); err != nil {
		return nil, err
	}

	for {
		record, err := r.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, err
		}
		encoded, err := encode(record)
		if err != nil {
			return nil, err
		}
		recordSize := int64(len(encoded))

		// Start a new file when this record would push the current one over
		// the limit. A file that holds no record yet is never abandoned:
		// rotating would not make the record fit and would only leave an
		// empty (or header-only) file behind.
		if fileRecords > 0 && fileSize+recordSize > sizeLimit {
			fn, err := openNext()
			if err != nil {
				return nil, err
			}
			log.Infof("Created new partial CSV file: %s, started row:%d", fn, rowCount)
		}

		if _, err := w.Write(encoded); err != nil {
			return nil, err
		}
		fileSize += recordSize
		fileRecords++
		rowCount++
	}

	// Flush and close the last file, surfacing any write error that was
	// still pending in the buffer.
	if err := closeCurrent(); err != nil {
		return nil, err
	}
	return createdFiles, nil
}
func main() {
flag.Parse()
// Create the output directory
mustCreateDir(*outDir)
// Split the CSV file
if len(*csvFile) == 0 {
log.Fatal("Please specify a CSV file")
}
files, err := splitCSVFile(*csvFile, *sizeLimit, *outDir, *hasHeader)
if err != nil {
log.Fatalf("Failed to split CSV file: %s, error: %v", *csvFile, err)
}
for _, f := range files {
fmt.Println(f)
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment