Skip to content

Instantly share code, notes, and snippets.

@robherley
Created April 25, 2023 21:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save robherley/74404f4d5065b308624953c5cf801f88 to your computer and use it in GitHub Desktop.
Save robherley/74404f4d5065b308624953c5cf801f88 to your computer and use it in GitHub Desktop.
Efficiently reading files from big zips from Azure Blob

Efficiently reading files from big zips from Azure Blob

Run go run main.go to test:

2023/04/25 17:02:26.660597 blob size is 10592348960 bytes, which is ~ 10101 MB
2023/04/25 17:02:26.660668 reading 1024 bytes from azblob from offset 10592347936
2023/04/25 17:02:26.680128 reading 20 bytes from azblob from offset 10592348918
2023/04/25 17:02:26.695184 reading 56 bytes from azblob from offset 10592348862
2023/04/25 17:02:26.710803 reading 4096 bytes from azblob from offset 10592340479
2023/04/25 17:02:26.745270 reading 4096 bytes from azblob from offset 10592344575
2023/04/25 17:02:26.759124 reading 289 bytes from azblob from offset 10592348671
2023/04/25 17:02:26.775641 archive contains 102 files
2023/04/25 17:02:26.775685 found wanted file: files/hello.txt => 12 bytes
2023/04/25 17:02:26.775699 reading 30 bytes from azblob from offset 104880592
2023/04/25 17:02:26.792228 reading 12 bytes from azblob from offset 104880665
2023/04/25 17:02:26.808158 ----start content from blob
hello world

2023/04/25 17:02:26.808210 ----end content from blob

Despite this zip file being ~10GB in blob, we only need to download the specific portion of the file we're looking for (plus some extra bytes for metadata).

How?

By implementing an io.ReaderAt, we can leverage the power of Go's standard library zip package (archive/zip), specifically zip.NewReader.

Implementing this reader is pretty simple, since the Blob Streaming API already supports seeking bytes using an offset and content length:

// ReadAt implements io.ReaderAt: it fetches exactly len(p) bytes of the blob
// starting at byte offset off via a ranged download.
func (b *BlobReader) ReadAt(p []byte, off int64) (n int, err error) {
	// Translate the (offset, length) pair into an HTTP Range request.
	httpRange := blob.HTTPRange{
		Offset: off,
		Count:  int64(len(p)),
	}

	res, err := b.client.DownloadStream(context.Background(), &blob.DownloadStreamOptions{
		Range: httpRange,
	})
	if err != nil {
		return 0, err
	}
	defer res.Body.Close()

	// io.ReadFull returns a nil error only once p is completely filled,
	// which matches the io.ReaderAt contract (n < len(p) implies err != nil).
	return io.ReadFull(res.Body, p)
}

With the power of the ZIP's Central Directory, the standard library's zip.Reader can efficiently read small portions of the file to find the content we're looking for.

Setup

Make a zip containing a large 10GB file, a bunch of 1MB files, and a file we want to read:

mkdir -p files
# make a 10GB file
dd if=/dev/urandom of=files/large_file bs=1m count=10000
# make a bunch of 1MB files
for i in {1..100}; do
  dd if=/dev/urandom of="files/file${i}" bs=1m count=1
done
# make an individual file that we want to read
echo "hello world" > files/hello.txt
# zip them together (may take a bit)
zip archive.zip files/*

Since we read from /dev/urandom these don't compress well which is great for our test:

$ ls -lah archive.zip
-rw-r--r--  1 robherley  staff   9.9G Apr 25 12:21 archive.zip

This is uploaded to blob (with public access):

https://robstorage123.blob.core.windows.net/zip-test/archive.zip
package main
import (
"archive/zip"
"context"
"errors"
"io"
"log"
"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/blob"
)
func init() {
log.SetFlags(log.LstdFlags | log.Lmicroseconds)
}
const (
	// ArchiveURL is the publicly readable blob holding the ~10GB test archive.
	ArchiveURL = "https://robstorage123.blob.core.windows.net/zip-test/archive.zip"
	// FileWeWant is the archive member we extract without downloading the whole zip.
	FileWeWant = "files/hello.txt"
)

// Compile-time assertion that BlobReader satisfies io.ReaderAt.
var _ io.ReaderAt = &BlobReader{}

// BlobReader is an abstraction over the azblob client that implements io.ReaderAt using the Stream API with HTTPRange
type BlobReader struct {
	client *blob.Client
}
// NewBlobReader wraps the given azblob client in a BlobReader so it can be
// handed to zip.NewReader as an io.ReaderAt.
func NewBlobReader(client *blob.Client) *BlobReader {
	return &BlobReader{client: client}
}
// ReadAt implements io.ReaderAt by issuing a ranged download: it reads
// exactly len(p) bytes of the blob starting at byte offset off.
func (b *BlobReader) ReadAt(p []byte, off int64) (n int, err error) {
	rng := blob.HTTPRange{Offset: off, Count: int64(len(p))}
	log.Println("reading", rng.Count, "bytes from azblob from offset", rng.Offset)

	res, err := b.client.DownloadStream(context.Background(), &blob.DownloadStreamOptions{Range: rng})
	if err != nil {
		return 0, err
	}
	defer res.Body.Close()

	// io.ReadFull only returns a nil error when p is completely filled,
	// satisfying the io.ReaderAt contract.
	return io.ReadFull(res.Body, p)
}
// getFileFromReader returns a file matching fileName from a zip.Reader
func getFileFromReader(archive *zip.Reader, fileName string) (*zip.File, error) {
for _, file := range archive.File {
if file.Name == fileName {
return file, nil
}
}
return nil, errors.New("file not found")
}
// readContent reads the content of a zip.File to a byte slice
// in the "real" implementation this would be sent to the http client making the request
func readContent(file *zip.File) ([]byte, error) {
rc, err := file.Open()
if err != nil {
log.Fatal(err)
}
defer rc.Close()
// in the "real" implementation we can split up the size of these reads to something safer
content := make([]byte, file.UncompressedSize64)
_, err = rc.Read(content)
if err != nil {
log.Fatal(err)
}
return content, nil
}
// main demonstrates reading a single small file out of a ~10GB zip in Azure
// Blob storage, downloading only the central directory and the one member's
// bytes via ranged reads.
func main() {
	client, err := blob.NewClientWithNoCredential(ArchiveURL, nil)
	if err != nil {
		log.Fatalln(err)
	}

	props, err := client.GetProperties(context.Background(), nil)
	if err != nil {
		log.Fatalln(err)
	}
	size := *props.ContentLength
	log.Println("blob size is", size, "bytes, which is ~", size/1024/1024, "MB")

	// zip.NewReader only issues ranged reads through our io.ReaderAt; it
	// never downloads the whole blob.
	archive, err := zip.NewReader(NewBlobReader(client), size)
	if err != nil {
		log.Fatalln(err)
	}
	log.Println("archive contains", len(archive.File), "files")

	wanted, err := getFileFromReader(archive, FileWeWant)
	if err != nil {
		log.Fatalln(err)
	}
	log.Println("found wanted file:", wanted.Name, "=>", wanted.UncompressedSize64, "bytes")

	contents, err := readContent(wanted)
	if err != nil {
		log.Fatalln(err)
	}
	log.Println("----start content from blob")
	println(string(contents))
	log.Println("----end content from blob")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment