Skip to content

Instantly share code, notes, and snippets.

@moonorange
Last active March 28, 2024 01:49
Show Gist options
  • Save moonorange/d09b6a0cd7f265b9a8d1ee5ddadd18e0 to your computer and use it in GitHub Desktop.
Go script to generate a large CSV file
package large_csv_generator
import (
"log"
"os"
"testing"
)
const (
	// totalNumRows is the total number of CSV rows generated per benchmark run.
	totalNumRows = 100000000
	// fileName is the base name (without extension) of the generated files.
	fileName = "test_data"
)
func BenchmarkGenerateLargeCSVParallel(b *testing.B) {
GenerateLargeCSVParallel(totalNumRows/10, 10, fileName)
defer CleanUp()
}
func BenchmarkGenerateLargeCSV(b *testing.B) {
GenerateLargeCSV(totalNumRows, fileName)
defer CleanUp()
}
func BenchmarkGenerateLargeCSVParallelToOneFile(b *testing.B) {
GenerateLargeCSVParallelToOneFile(totalNumRows/10, 10, fileName)
defer CleanUp()
}
func CleanUp() {
err := os.RemoveAll("data")
if err != nil {
log.Fatal(err)
}
}
package main
import (
"encoding/csv"
"errors"
"fmt"
"os"
"sync"
"time"
"github.com/brianvoe/gofakeit"
)
const (
	// numRowsTotal is the total number of rows to generate across all files.
	numRowsTotal = 1000
	// numGoroutines is the number of concurrent workers (and output files).
	numGoroutines = 1
	// numRowsPerFile is each worker's share of the total row count.
	numRowsPerFile = numRowsTotal / numGoroutines
)
// main generates the test_data CSV files in parallel using the
// package-level row/goroutine constants.
func main() {
	GenerateLargeCSVParallel(numRowsPerFile, numGoroutines, "test_data")
}
// GenerateLargeCSV writes numRows rows of fake data to data/<fileName>.csv,
// creating the data directory if needed. It panics on any I/O error.
func GenerateLargeCSV(numRows int, fileName string) {
	// Create the output directory; an already-existing directory is fine.
	if err := os.Mkdir("data", 0777); err != nil && !errors.Is(err, os.ErrExist) {
		panic(err)
	}
	file, err := os.Create(fmt.Sprintf("data/%s.csv", fileName))
	if err != nil {
		panic(err)
	}
	defer file.Close()

	writer := csv.NewWriter(file)
	for i := 0; i < numRows; i++ {
		if err := writer.Write(generateFakeRow()); err != nil {
			panic(err)
		}
	}
	writer.Flush()
	// Fix: Flush reports nothing itself; write errors surface via Error().
	// The original dropped them, silently producing truncated files.
	if err := writer.Error(); err != nil {
		panic(err)
	}
}
// generateFakeRow produces one fake CSV record: a UUID, a random uint8
// rendered as decimal text, and two random timestamps drawn from the last
// year.
func generateFakeRow() []string {
	now := time.Now()
	yearAgo := now.AddDate(-1, 0, 0)

	row := make([]string, 0, 4)
	row = append(row, gofakeit.UUID())
	row = append(row, fmt.Sprintf("%d", gofakeit.Uint8()))
	row = append(row, gofakeit.DateRange(yearAgo, now).Format(time.DateTime))
	row = append(row, gofakeit.DateRange(yearAgo, now).Format(time.DateTime))
	return row
}
// GenerateLargeCSVParallel splits generation across numGoroutines
// goroutines, each writing numRows rows to its own file
// data/<fileName>_<i>.csv, and blocks until all of them finish.
func GenerateLargeCSVParallel(numRows, numGoroutines int, fileName string) {
	var wg sync.WaitGroup
	wg.Add(numGoroutines)
	for i := 0; i < numGoroutines; i++ {
		// i is passed as an argument to avoid loop-variable capture.
		go func(i int) {
			// Fix: register Done before the work, not after it, so the
			// counter is always decremented when the goroutine exits.
			defer wg.Done()
			GenerateLargeCSV(numRows, fmt.Sprintf("%s_%d", fileName, i))
		}(i)
	}
	// Wait for all workers to finish.
	wg.Wait()
	// Fix: use Println — the original Printf left the line unterminated.
	fmt.Println("Done GenerateLargeCSVParallel")
}
package main
import (
"encoding/csv"
"errors"
"fmt"
"io"
"os"
"sync"
"time"
"github.com/brianvoe/gofakeit"
)
const (
	// numRowsTotal is the total number of rows to generate.
	numRowsTotal = 1000
	// numGoroutines is the number of concurrent writers.
	numGoroutines = 1
	// numRowsPerFile is each writer's share of the total row count.
	numRowsPerFile = numRowsTotal / numGoroutines
)
// main has all workers write concurrently into the single file
// data/test_data.csv via the mutex-guarded writer.
func main() {
	GenerateLargeCSVParallelToOneFile(numRowsPerFile, numGoroutines, "test_data")
}
// generateFakeRow produces one fake CSV record: a UUID, a random uint8
// rendered as decimal text, and two random timestamps drawn from the last
// year.
func generateFakeRow() []string {
	now := time.Now()
	yearAgo := now.AddDate(-1, 0, 0)

	row := make([]string, 0, 4)
	row = append(row, gofakeit.UUID())
	row = append(row, fmt.Sprintf("%d", gofakeit.Uint8()))
	row = append(row, gofakeit.DateRange(yearAgo, now).Format(time.DateTime))
	row = append(row, gofakeit.DateRange(yearAgo, now).Format(time.DateTime))
	return row
}
// GenerateLargeCSVParallelToOneFile runs numGoroutines goroutines that each
// write numRows rows into the single file data/<fileName>.csv through a
// mutex-guarded writer, and blocks until all of them finish.
func GenerateLargeCSVParallelToOneFile(numRows, numGoroutines int, fileName string) {
	// Create the output directory; an already-existing directory is fine.
	if err := os.Mkdir("data", 0777); err != nil && !errors.Is(err, os.ErrExist) {
		panic(err)
	}
	// Fix: os.Create truncates an existing file instead of failing, so it
	// never returns os.ErrExist. The original's ErrExist check swallowed
	// real failures and left file nil, crashing later — treat any error as
	// fatal here.
	file, err := os.Create(fmt.Sprintf("data/%s.csv", fileName))
	if err != nil {
		panic(err)
	}
	defer file.Close()

	writer, err := NewCSVWriter(file)
	if err != nil {
		panic(err)
	}

	var wg sync.WaitGroup
	wg.Add(numGoroutines)
	for i := 0; i < numGoroutines; i++ {
		go func() {
			// Fix: register Done before the work so the counter is always
			// decremented when the goroutine exits.
			defer wg.Done()
			GenerateLargeCSVWithLock(numRows, writer)
		}()
	}
	// Wait for all workers to finish.
	wg.Wait()
	// Fix: use Println — the original Printf left the line unterminated.
	fmt.Println("Done GenerateLargeCSVParallelToOneFile")
}
// GenerateLargeCSVWithLock writes numRows fake rows through the shared,
// mutex-protected writer and flushes it when done. It panics on any write
// error.
func GenerateLargeCSVWithLock(numRows int, writer *CsvWriter) {
	for written := 0; written < numRows; written++ {
		if err := writer.Write(generateFakeRow()); err != nil {
			panic(err)
		}
	}
	writer.Flush()
}
// CsvWriter is a csv.Writer made safe for concurrent use: every operation
// acquires the mutex before touching the underlying writer.
type CsvWriter struct {
	mutex     *sync.Mutex // serializes access to csvWriter
	csvWriter *csv.Writer // underlying, non-thread-safe CSV writer
}
// NewCSVWriter wraps file in a mutex-guarded CSV writer. The returned error
// is always nil; it is kept for signature stability with existing callers.
func NewCSVWriter(file io.Writer) (*CsvWriter, error) {
	safe := &CsvWriter{
		mutex:     &sync.Mutex{},
		csvWriter: csv.NewWriter(file),
	}
	return safe, nil
}
// Write appends one record to the buffer while holding the mutex, making it
// safe to call from multiple goroutines.
func (w *CsvWriter) Write(row []string) error {
	w.mutex.Lock()
	defer w.mutex.Unlock()
	return w.csvWriter.Write(row)
}
// Flush writes any buffered records to the underlying writer while holding
// the mutex.
func (w *CsvWriter) Flush() {
	w.mutex.Lock()
	defer w.mutex.Unlock()
	w.csvWriter.Flush()
}
package main
import (
"encoding/csv"
"errors"
"fmt"
"os"
"sync"
"time"
"github.com/brianvoe/gofakeit"
)
const (
	// numRowsTotal is the total number of rows to generate.
	numRowsTotal = 1000
	// numGoroutines is the number of concurrent workers.
	numGoroutines = 1
	// numRowsPerFile is each worker's share of the total row count.
	numRowsPerFile = numRowsTotal / numGoroutines
)
// main generates all rows sequentially into data/test_data.csv.
func main() {
	GenerateLargeCSV(numRowsTotal, "test_data")
}
// GenerateLargeCSV writes numRows rows of fake data to data/<fileName>.csv,
// creating the data directory if needed. It panics on any I/O error.
func GenerateLargeCSV(numRows int, fileName string) {
	// Create the output directory; an already-existing directory is fine.
	if err := os.Mkdir("data", 0777); err != nil && !errors.Is(err, os.ErrExist) {
		panic(err)
	}
	file, err := os.Create(fmt.Sprintf("data/%s.csv", fileName))
	if err != nil {
		panic(err)
	}
	defer file.Close()

	writer := csv.NewWriter(file)
	for i := 0; i < numRows; i++ {
		if err := writer.Write(generateFakeRow()); err != nil {
			panic(err)
		}
	}
	writer.Flush()
	// Fix: Flush reports nothing itself; write errors surface via Error().
	// The original dropped them, silently producing truncated files.
	if err := writer.Error(); err != nil {
		panic(err)
	}
}
// generateFakeRow produces one fake CSV record: a UUID, a random uint8
// rendered as decimal text, and two random timestamps drawn from the last
// year.
func generateFakeRow() []string {
	now := time.Now()
	yearAgo := now.AddDate(-1, 0, 0)

	row := make([]string, 0, 4)
	row = append(row, gofakeit.UUID())
	row = append(row, fmt.Sprintf("%d", gofakeit.Uint8()))
	row = append(row, gofakeit.DateRange(yearAgo, now).Format(time.DateTime))
	row = append(row, gofakeit.DateRange(yearAgo, now).Format(time.DateTime))
	return row
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment