Skip to content

Instantly share code, notes, and snippets.

@robherley
Created April 25, 2023 21:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save robherley/74404f4d5065b308624953c5cf801f88 to your computer and use it in GitHub Desktop.
Save robherley/74404f4d5065b308624953c5cf801f88 to your computer and use it in GitHub Desktop.
Efficiently reading files from big zips from Azure Blob

Efficiently reading files from big zips from Azure Blob

Run go run main.go to test:

2023/04/25 17:02:26.660597 blob size is 10592348960 bytes, which is ~ 10101 MB
2023/04/25 17:02:26.660668 reading 1024 bytes from azblob from offset 10592347936
2023/04/25 17:02:26.680128 reading 20 bytes from azblob from offset 10592348918
2023/04/25 17:02:26.695184 reading 56 bytes from azblob from offset 10592348862
2023/04/25 17:02:26.710803 reading 4096 bytes from azblob from offset 10592340479
2023/04/25 17:02:26.745270 reading 4096 bytes from azblob from offset 10592344575
2023/04/25 17:02:26.759124 reading 289 bytes from azblob from offset 10592348671
2023/04/25 17:02:26.775641 archive contains 102 files
2023/04/25 17:02:26.775685 found wanted file: files/hello.txt => 12 bytes
2023/04/25 17:02:26.775699 reading 30 bytes from azblob from offset 104880592
2023/04/25 17:02:26.792228 reading 12 bytes from azblob from offset 104880665
2023/04/25 17:02:26.808158 ----start content from blob
hello world

2023/04/25 17:02:26.808210 ----end content from blob

Despite this zip file being ~10GB in blob, we only need to download the specific portion of the file we're looking for (plus some extra bytes for metadata).

How?

By implementing an io.ReaderAt, we can leverage the power of Go's standard library zip package (archive/zip), specifically zip.NewReader.

Implementing this reader is pretty simple, since the Blob Streaming API already supports seeking bytes using an offset and content length:

// ReadAt implements io.ReaderAt: it fetches exactly len(p) bytes of the blob
// starting at byte offset off via a ranged download.
func (b *BlobReader) ReadAt(p []byte, off int64) (n int, err error) {
	// Translate the (offset, length) pair into an HTTP Range request.
	httpRange := blob.HTTPRange{
		Offset: off,
		Count:  int64(len(p)),
	}

	res, err := b.client.DownloadStream(context.Background(), &blob.DownloadStreamOptions{
		Range: httpRange,
	})
	if err != nil {
		return 0, err
	}
	defer res.Body.Close()

	// io.ReadFull returns a nil error only once p is completely filled,
	// which matches the io.ReaderAt contract (n < len(p) implies err != nil).
	return io.ReadFull(res.Body, p)
}

With the power of the ZIP's Central Directory, the standard library's zip.Reader can efficiently read small portions of the file to find the content we're looking for.

Setup

Make a zip containing a large 10GB file, a bunch of 1MB files, and a file we want to read:

mkdir -p files
# make a 10GB file
dd if=/dev/urandom of=files/large_file bs=1m count=10000
# make a bunch of 1MB files
for i in {1..100}; do
  dd if=/dev/urandom of="files/file${i}" bs=1m count=1
done
# make an individual file that we want to read
echo "hello world" > files/hello.txt
# zip them together (may take a bit)
zip archive.zip files/*

Since we read from /dev/urandom these don't compress well which is great for our test:

$ ls -lah archive.zip
-rw-r--r--  1 robherley  staff   9.9G Apr 25 12:21 archive.zip

This is uploaded to blob (with public access):

https://robstorage123.blob.core.windows.net/zip-test/archive.zip
package main
import (
"archive/zip"
"context"
"errors"
"io"
"log"
"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/blob"
)
func init() {
log.SetFlags(log.LstdFlags | log.Lmicroseconds)
}
const (
	// ArchiveURL is the publicly readable blob holding the ~10GB test archive.
	ArchiveURL = "https://robstorage123.blob.core.windows.net/zip-test/archive.zip"
	// FileWeWant is the archive member we extract without downloading the whole zip.
	FileWeWant = "files/hello.txt"
)

// Compile-time assertion that BlobReader satisfies io.ReaderAt.
var _ io.ReaderAt = &BlobReader{}

// BlobReader is an abstraction over the azblob client that implements io.ReaderAt using the Stream API with HTTPRange
type BlobReader struct {
	client *blob.Client
}
// NewBlobReader wraps the given azblob client in a BlobReader so it can be
// handed to zip.NewReader as an io.ReaderAt.
func NewBlobReader(client *blob.Client) *BlobReader {
	return &BlobReader{client: client}
}
// ReadAt implements io.ReaderAt by issuing a ranged download: it reads
// exactly len(p) bytes of the blob starting at byte offset off.
func (b *BlobReader) ReadAt(p []byte, off int64) (n int, err error) {
	rng := blob.HTTPRange{Offset: off, Count: int64(len(p))}
	log.Println("reading", rng.Count, "bytes from azblob from offset", rng.Offset)

	res, err := b.client.DownloadStream(context.Background(), &blob.DownloadStreamOptions{Range: rng})
	if err != nil {
		return 0, err
	}
	defer res.Body.Close()

	// io.ReadFull only returns a nil error when p is completely filled,
	// satisfying the io.ReaderAt contract.
	return io.ReadFull(res.Body, p)
}
// getFileFromReader returns a file matching fileName from a zip.Reader
func getFileFromReader(archive *zip.Reader, fileName string) (*zip.File, error) {
for _, file := range archive.File {
if file.Name == fileName {
return file, nil
}
}
return nil, errors.New("file not found")
}
// readContent reads the content of a zip.File to a byte slice
// in the "real" implementation this would be sent to the http client making the request
func readContent(file *zip.File) ([]byte, error) {
rc, err := file.Open()
if err != nil {
log.Fatal(err)
}
defer rc.Close()
// in the "real" implementation we can split up the size of these reads to something safer
content := make([]byte, file.UncompressedSize64)
_, err = rc.Read(content)
if err != nil {
log.Fatal(err)
}
return content, nil
}
// main demonstrates reading a single small file out of a ~10GB zip in Azure
// Blob storage, downloading only the central directory and the one member's
// bytes via ranged reads.
func main() {
	client, err := blob.NewClientWithNoCredential(ArchiveURL, nil)
	if err != nil {
		log.Fatalln(err)
	}

	props, err := client.GetProperties(context.Background(), nil)
	if err != nil {
		log.Fatalln(err)
	}
	size := *props.ContentLength
	log.Println("blob size is", size, "bytes, which is ~", size/1024/1024, "MB")

	// zip.NewReader only issues ranged reads through our io.ReaderAt; it
	// never downloads the whole blob.
	archive, err := zip.NewReader(NewBlobReader(client), size)
	if err != nil {
		log.Fatalln(err)
	}
	log.Println("archive contains", len(archive.File), "files")

	wanted, err := getFileFromReader(archive, FileWeWant)
	if err != nil {
		log.Fatalln(err)
	}
	log.Println("found wanted file:", wanted.Name, "=>", wanted.UncompressedSize64, "bytes")

	contents, err := readContent(wanted)
	if err != nil {
		log.Fatalln(err)
	}
	log.Println("----start content from blob")
	println(string(contents))
	log.Println("----end content from blob")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment