Last active
March 28, 2024 01:49
-
-
Save moonorange/d09b6a0cd7f265b9a8d1ee5ddadd18e0 to your computer and use it in GitHub Desktop.
Go script to generate a large CSV file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package large_csv_generator | |
import ( | |
"log" | |
"os" | |
"testing" | |
) | |
// totalNumRows is the total number of rows each benchmark asks the generator
// to produce.
const totalNumRows = 100_000_000

// fileName is the base output name handed to the generators by the benchmarks.
const fileName = "test_data"
func BenchmarkGenerateLargeCSVParallel(b *testing.B) { | |
GenerateLargeCSVParallel(totalNumRows/10, 10, fileName) | |
defer CleanUp() | |
} | |
func BenchmarkGenerateLargeCSV(b *testing.B) { | |
GenerateLargeCSV(totalNumRows, fileName) | |
defer CleanUp() | |
} | |
func BenchmarkGenerateLargeCSVParallelToOneFile(b *testing.B) { | |
GenerateLargeCSVParallelToOneFile(totalNumRows/10, 10, fileName) | |
defer CleanUp() | |
} | |
// CleanUp deletes the "data" directory (relative to the working directory)
// together with everything inside it. If removal fails, the error is logged
// and the process exits via log.Fatal.
func CleanUp() {
	if removeErr := os.RemoveAll("data"); removeErr != nil {
		log.Fatal(removeErr)
	}
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"encoding/csv" | |
"errors" | |
"fmt" | |
"os" | |
"sync" | |
"time" | |
"github.com/brianvoe/gofakeit" | |
) | |
// numRowsTotal is the total number of rows to generate.
const numRowsTotal = 1000

// numGoroutines is the number of goroutines the rows are divided between.
const numGoroutines = 1

// numRowsPerFile is the share of numRowsTotal given to each goroutine
// (integer division).
const numRowsPerFile = numRowsTotal / numGoroutines
func main() { | |
GenerateLargeCSVParallel(numRowsPerFile, numGoroutines, "test_data") | |
} | |
// GenerateLargeCSV generates a CSV file with numRows rows | |
func GenerateLargeCSV(numRows int, fileName string) { | |
err := os.Mkdir("data", 0777) | |
if err != nil { | |
if !errors.Is(err, os.ErrExist) { | |
panic(err) | |
} | |
} | |
file, err := os.Create(fmt.Sprintf("data/%s.csv", fileName)) | |
if err != nil { | |
panic(err) | |
} | |
defer file.Close() | |
writer := csv.NewWriter(file) | |
for i := 0; i < numRows; i++ { | |
row := generateFakeRow() | |
if err := writer.Write(row); err != nil { | |
panic(err) | |
} | |
} | |
writer.Flush() | |
} | |
func generateFakeRow() []string { | |
// 1 year ago | |
startDate := time.Now().AddDate(-1, 0, 0) | |
endDate := time.Now() | |
return []string{ | |
gofakeit.UUID(), | |
fmt.Sprintf("%d", gofakeit.Uint8()), | |
gofakeit.DateRange(startDate, endDate).Format(time.DateTime), | |
gofakeit.DateRange(startDate, endDate).Format(time.DateTime), | |
} | |
} | |
// Parallelize CSV generation | |
func GenerateLargeCSVParallel(numRows, numGoroutines int, fileName string) { | |
var wg sync.WaitGroup | |
// Add numGoroutines to the WaitGroup | |
wg.Add(numGoroutines) | |
for i := 0; i < numGoroutines; i++ { | |
// Call GenerateLargeCSV in a goroutine for numGoroutines times | |
go func(wg *sync.WaitGroup, i int) { | |
fileName := fmt.Sprintf("%s_%d", fileName, i) | |
GenerateLargeCSV(numRows, fileName) | |
// Decrement the WaitGroup counter after each goroutine finishes | |
defer wg.Done() | |
}(&wg, i) | |
} | |
// Wait for all goroutines to finish | |
wg.Wait() | |
fmt.Printf("Done GenerateLargeCSVParallel") | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"encoding/csv" | |
"errors" | |
"fmt" | |
"io" | |
"os" | |
"sync" | |
"time" | |
"github.com/brianvoe/gofakeit" | |
) | |
// numRowsTotal is the total number of rows to generate.
const numRowsTotal = 1000

// numGoroutines is the number of goroutines the rows are divided between.
const numGoroutines = 1

// numRowsPerFile is the number of rows each goroutine writes
// (numRowsTotal divided by numGoroutines, integer division).
const numRowsPerFile = numRowsTotal / numGoroutines
func main() { | |
GenerateLargeCSVParallelToOneFile(numRowsPerFile, numGoroutines, "test_data") | |
} | |
func generateFakeRow() []string { | |
// 1 year ago | |
startDate := time.Now().AddDate(-1, 0, 0) | |
endDate := time.Now() | |
return []string{ | |
gofakeit.UUID(), | |
fmt.Sprintf("%d", gofakeit.Uint8()), | |
gofakeit.DateRange(startDate, endDate).Format(time.DateTime), | |
gofakeit.DateRange(startDate, endDate).Format(time.DateTime), | |
} | |
} | |
// Parallelize CSV generation while writing the same file | |
func GenerateLargeCSVParallelToOneFile(numRows, numGoroutines int, fileName string) { | |
err := os.Mkdir("data", 0777) | |
if err != nil { | |
if !errors.Is(err, os.ErrExist) { | |
panic(err) | |
} | |
} | |
file, err := os.Create(fmt.Sprintf("data/%s.csv", fileName)) | |
if err != nil { | |
if !errors.Is(err, os.ErrExist) { | |
panic(err) | |
} | |
} | |
defer file.Close() | |
var wg sync.WaitGroup | |
// Add numGoroutines to the WaitGroup | |
wg.Add(numGoroutines) | |
writer, err := NewCSVWriter(file) | |
if err != nil { | |
panic(err) | |
} | |
for i := 0; i < numGoroutines; i++ { | |
// Call GenerateLargeCSV in a goroutine for numGoroutines times | |
go func(wg *sync.WaitGroup, i int, writer *CsvWriter) { | |
GenerateLargeCSVWithLock(numRows, writer) | |
// Decrement the WaitGroup counter after each goroutine finishes | |
defer wg.Done() | |
}(&wg, i, writer) | |
} | |
// Wait for all goroutines to finish | |
wg.Wait() | |
fmt.Printf("Done GenerateLargeCSVParallelToOneFile") | |
} | |
func GenerateLargeCSVWithLock(numRows int, writer *CsvWriter) { | |
for i := 0; i < numRows; i++ { | |
row := generateFakeRow() | |
if err := writer.Write(row); err != nil { | |
panic(err) | |
} | |
} | |
writer.Flush() | |
} | |
// CsvWriter wraps a csv.Writer so it can be shared between goroutines: every
// call on the underlying writer is made while holding mutex.
type CsvWriter struct {
	mutex     *sync.Mutex
	csvWriter *csv.Writer
}

// NewCSVWriter returns a CsvWriter that encodes records to file.
// The returned error is always nil.
func NewCSVWriter(file io.Writer) (*CsvWriter, error) {
	return &CsvWriter{csvWriter: csv.NewWriter(file), mutex: &sync.Mutex{}}, nil
}

// Write encodes row as one CSV record while holding the lock. Output is
// buffered by the underlying csv.Writer, so call Flush to push it to the
// destination.
func (w *CsvWriter) Write(row []string) error {
	w.mutex.Lock()
	defer w.mutex.Unlock()
	return w.csvWriter.Write(row)
}

// Flush writes any buffered records to the destination while holding the
// lock. Flush itself reports nothing; call Error afterwards to learn whether
// the write succeeded.
func (w *CsvWriter) Flush() {
	w.mutex.Lock()
	defer w.mutex.Unlock()
	w.csvWriter.Flush()
}

// Error returns any error recorded by the underlying csv.Writer during an
// earlier Write or Flush, mirroring csv.Writer.Error.
func (w *CsvWriter) Error() error {
	w.mutex.Lock()
	defer w.mutex.Unlock()
	return w.csvWriter.Error()
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"encoding/csv" | |
"errors" | |
"fmt" | |
"os" | |
"sync" | |
"time" | |
"github.com/brianvoe/gofakeit" | |
) | |
// numRowsTotal is the number of rows main asks GenerateLargeCSV to write.
const numRowsTotal = 1000

// numGoroutines is only used here to derive numRowsPerFile.
const numGoroutines = 1

// numRowsPerFile is numRowsTotal divided by numGoroutines (integer division).
const numRowsPerFile = numRowsTotal / numGoroutines
func main() { | |
GenerateLargeCSV(numRowsTotal, "test_data") | |
} | |
// GenerateLargeCSV generates a CSV file with numRows rows | |
func GenerateLargeCSV(numRows int, fileName string) { | |
err := os.Mkdir("data", 0777) | |
if err != nil { | |
if !errors.Is(err, os.ErrExist) { | |
panic(err) | |
} | |
} | |
file, err := os.Create(fmt.Sprintf("data/%s.csv", fileName)) | |
if err != nil { | |
panic(err) | |
} | |
defer file.Close() | |
writer := csv.NewWriter(file) | |
for i := 0; i < numRows; i++ { | |
row := generateFakeRow() | |
if err := writer.Write(row); err != nil { | |
panic(err) | |
} | |
} | |
writer.Flush() | |
} | |
func generateFakeRow() []string { | |
// 1 year ago | |
startDate := time.Now().AddDate(-1, 0, 0) | |
endDate := time.Now() | |
return []string{ | |
gofakeit.UUID(), | |
fmt.Sprintf("%d", gofakeit.Uint8()), | |
gofakeit.DateRange(startDate, endDate).Format(time.DateTime), | |
gofakeit.DateRange(startDate, endDate).Format(time.DateTime), | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment