Last active
August 29, 2023 16:56
-
-
Save betelgeuse-7/e47b71b88b755f8d5d7759596f904072 to your computer and use it in GitHub Desktop.
Split a file into chunks.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Usage: FILE=file.ext CHUNKS=4 go run split.go
// CHUNKS is optional (default is 4).
package main | |
import ( | |
"fmt" | |
"os" | |
"strconv" | |
"sync" | |
) | |
// Log formats msg with args using fmt's Printf verbs and writes the result,
// followed by a newline, to standard output.
func Log(msg string, args ...any) {
	format := msg + "\n"
	fmt.Fprintf(os.Stdout, format, args...)
}
func LogAndExit(msg string, args ...any) { | |
Log(msg, args...) | |
os.Exit(1) | |
} | |
func main() { | |
fname := os.Getenv("FILE") | |
chunksEnv := os.Getenv("CHUNKS") | |
if fname == "" { | |
LogAndExit("A file is needed.") | |
} | |
var chunks int | |
if chunksEnv == "" { | |
Log("CHUNKS unspecified. Defaulting to 4...") | |
chunks = 4 | |
} else { | |
chnks, err := strconv.Atoi(chunksEnv) | |
if err != nil { | |
LogAndExit("Erroneous CHUNK variable: %s\n", err.Error()) | |
} | |
chunks = chnks | |
} | |
f, err := os.Open(fname) | |
if err != nil { | |
LogAndExit("Error: %s\n", err.Error()) | |
} | |
finfo, err := f.Stat() | |
if err != nil { | |
LogAndExit("Stat error: %s\n", err.Error()) | |
} | |
size := int(finfo.Size()) | |
sizeModChunks := size%chunks | |
isEvenlyDivisible := sizeModChunks == 0 | |
var spillNumber int | |
if !(isEvenlyDivisible) { | |
spillNumber = sizeModChunks | |
} | |
Log("file %s size %d chunks %d spill %d\n", fname, size, chunks, spillNumber) | |
eachChunkSize := (size-sizeModChunks) / chunks | |
lastChunkSize := eachChunkSize + spillNumber | |
Log("CHUNKS") | |
for i := 0; i < chunks; i++ { | |
chnkNum := eachChunkSize | |
if i == chunks-1 { chnkNum = lastChunkSize } | |
Log(" #%d %dB", i+1, chnkNum) | |
} | |
var wg sync.WaitGroup | |
var totalBytesRead int | |
for i := 0; i < chunks; i++ { | |
byteCount := eachChunkSize | |
if i == chunks-1 { | |
byteCount = lastChunkSize | |
} | |
Log("\n>> Current offset %d", eachChunkSize*i) | |
bx, err := readBytes(f, byteCount, eachChunkSize*i) | |
if err != nil { | |
LogAndExit("Error while reading bytes: %s", err.Error()) | |
} | |
totalBytesRead += byteCount | |
wg.Add(1) | |
// if we use 'i' in fmt.Sprintf below, we'd not get what we expect. In the case where chunks=2, | |
// we always get ..._Chunk_2.bytes, because the loop ends before the goroutine even launches. | |
// I am not sure if it is actually related to the fact that Go doesn't create a new copy of the | |
// loop variable at each iteration or not (It uses the same address. This will change IIRC.). | |
// | |
// Anyways. | |
currentIdx := i | |
go func() { | |
defer wg.Done() | |
chunkName := fmt.Sprintf("%d_Chunk_%s.bytes", currentIdx+1, f.Name()) | |
if err := writeChunk(bx, chunkName); err != nil { | |
LogAndExit("Error while writing chunk %s: %s", chunkName, err.Error()) | |
} | |
}() | |
} | |
wg.Wait() | |
Log("\nFinished job.") | |
} | |
func readBytes(f *os.File, howMany int, idx int) ([]byte, error) { | |
var bx = make([]byte, howMany) | |
n, err := f.ReadAt(bx, int64(idx)) | |
Log(" Number of bytes read %d", n) | |
return bx, err | |
} | |
// writeChunk stores bx in the file chunkName, creating it with mode 0666
// (before umask) or truncating it if it already exists. It returns the error
// from os.WriteFile, if any.
func writeChunk(bx []byte, chunkName string) error {
	const chunkPerm os.FileMode = 0o666
	if err := os.WriteFile(chunkName, bx, chunkPerm); err != nil {
		return err
	}
	return nil
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment