Skip to content

Instantly share code, notes, and snippets.

@betelgeuse-7
Last active August 29, 2023 16:56
Show Gist options
  • Save betelgeuse-7/e47b71b88b755f8d5d7759596f904072 to your computer and use it in GitHub Desktop.
Save betelgeuse-7/e47b71b88b755f8d5d7759596f904072 to your computer and use it in GitHub Desktop.
Split a file into chunks.
// FILE=file.ext CHUNKS=4 go run split.go
// CHUNKS is optional (default is 4).
package main
import (
	"fmt"
	"os"
	"path/filepath"
	"strconv"
	"sync"
)
// Log writes msg to standard output, formatted with args using fmt.Printf
// verbs, and terminates it with a newline.
//
// The newline is appended to the format string itself, so callers should not
// add their own unless they want a blank line after the message.
func Log(msg string, args ...any) {
	format := msg + "\n"
	fmt.Printf(format, args...)
}
// LogAndExit prints msg, formatted with args, through Log and then ends the
// process with exit status 1. It does not return; because it uses os.Exit,
// deferred functions in the caller are not run.
func LogAndExit(msg string, args ...any) {
	const failureStatus = 1

	Log(msg, args...)
	os.Exit(failureStatus)
}
// main splits the file named by the FILE environment variable into CHUNKS
// pieces (default 4) and writes piece n to "<n>_Chunk_<base name>.bytes" in
// the current working directory.
//
// Every chunk holds size/CHUNKS bytes except the last one, which also takes
// the size%CHUNKS remainder. Chunks are read one after another on the main
// goroutine and each is written by its own goroutine. The process exits with
// status 1 when the configuration is invalid or a read or write fails.
func main() {
	fname := os.Getenv("FILE")
	chunksEnv := os.Getenv("CHUNKS")
	if fname == "" {
		LogAndExit("A file is needed.")
	}

	chunks := 4
	if chunksEnv == "" {
		Log("CHUNKS unspecified. Defaulting to 4...")
	} else {
		parsed, err := strconv.Atoi(chunksEnv)
		if err != nil {
			LogAndExit("Erroneous CHUNKS variable: %s\n", err.Error())
		}
		chunks = parsed
	}
	// A chunk count of zero would divide by zero below, and a negative one
	// would silently produce no output at all, so reject both up front.
	if chunks < 1 {
		LogAndExit("Erroneous CHUNKS variable: must be at least 1, got %d", chunks)
	}

	f, err := os.Open(fname)
	if err != nil {
		LogAndExit("Error: %s\n", err.Error())
	}
	// Closes the input on the normal return path. The LogAndExit paths skip
	// deferred calls, but the process is ending there anyway.
	defer f.Close()

	finfo, err := f.Stat()
	if err != nil {
		LogAndExit("Stat error: %s\n", err.Error())
	}

	size := int(finfo.Size())
	eachChunkSize := size / chunks
	spillNumber := size % chunks // remainder bytes, appended to the last chunk
	lastChunkSize := eachChunkSize + spillNumber
	Log("file %s size %d chunks %d spill %d\n", fname, size, chunks, spillNumber)

	// sizeOf reports how many bytes chunk i (zero-based) holds.
	sizeOf := func(i int) int {
		if i == chunks-1 {
			return lastChunkSize
		}
		return eachChunkSize
	}

	// Name chunks after the file's base name only: f.Name() is the path as it
	// was given to os.Open, and embedding directory separators in the chunk
	// name would point the output into a directory that does not exist.
	baseName := filepath.Base(f.Name())
	nameOf := func(i int) string {
		return fmt.Sprintf("%d_Chunk_%s.bytes", i+1, baseName)
	}

	Log("CHUNKS")
	for i := 0; i < chunks; i++ {
		Log(" #%d %dB", i+1, sizeOf(i))
	}

	var wg sync.WaitGroup
	// One slot per chunk; each writer goroutine touches only its own index,
	// so no further synchronisation is needed beyond wg.Wait.
	writeErrs := make([]error, chunks)
	for i := 0; i < chunks; i++ {
		offset := eachChunkSize * i
		Log("\n>> Current offset %d", offset)
		bx, err := readBytes(f, sizeOf(i), offset)
		if err != nil {
			LogAndExit("Error while reading bytes: %s", err.Error())
		}

		wg.Add(1)
		// The index and data are passed as arguments so every goroutine gets
		// its own copies. Before Go 1.22 a closure capturing the loop variable
		// directly would share a single variable across iterations and could
		// observe a later value by the time it ran.
		go func(idx int, data []byte) {
			defer wg.Done()
			writeErrs[idx] = writeChunk(data, nameOf(idx))
		}(i, bx)
	}
	wg.Wait()

	// Report every failed chunk after all writers have finished, instead of
	// exiting from inside a goroutine while sibling writes are still running.
	failed := false
	for i, werr := range writeErrs {
		if werr != nil {
			failed = true
			Log("Error while writing chunk %s: %s", nameOf(i), werr.Error())
		}
	}
	if failed {
		os.Exit(1)
	}
	Log("\nFinished job.")
}
// readBytes reads howMany bytes from f starting at byte offset idx, using
// ReadAt so the file's own read position is not involved.
//
// It logs the number of bytes actually read and returns a buffer of length
// howMany together with the error from ReadAt. The buffer is returned at full
// length even when the read fell short, so callers must check the error
// before trusting its contents.
func readBytes(f *os.File, howMany int, idx int) ([]byte, error) {
	buf := make([]byte, howMany)
	offset := int64(idx)

	read, err := f.ReadAt(buf, offset)
	Log(" Number of bytes read %d", read)

	return buf, err
}
// writeChunk stores bx in the file named chunkName through os.WriteFile,
// which creates the file if needed (permission bits 0666 before the umask)
// and truncates it otherwise. It returns the error from that call, if any.
func writeChunk(bx []byte, chunkName string) error {
	const chunkPerm os.FileMode = 0o666

	if err := os.WriteFile(chunkName, bx, chunkPerm); err != nil {
		return err
	}
	return nil
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment