Skip to content

Instantly share code, notes, and snippets.

@sborquez
Last active April 8, 2024 15:10
Show Gist options
  • Save sborquez/d1e7d142b96789fc5f23bf9783559daa to your computer and use it in GitHub Desktop.
Save sborquez/d1e7d142b96789fc5f23bf9783559daa to your computer and use it in GitHub Desktop.
Load images from a tar file without extracting it
import io
import re
import tarfile
from typing import Any, Callable, Dict, Iterator, NoReturn, Tuple, Union

import cv2
import numpy as np
from google.cloud import storage
from PIL import Image
def reraise_exception(exc: Any) -> NoReturn:
    """Raise ``exc`` unchanged.

    Serves as the default error handler of ``tar_file_iterator``: errors are
    propagated to the caller instead of being swallowed.

    Args:
        exc: The exception instance to raise.

    Raises:
        Whatever ``exc`` is.
    """
    raise exc
def tar_file_iterator(uri: str, skip_meta: str = r"__[^/]*__($|/)", handler: Callable[[Exception], Any] = reraise_exception) -> Iterator[Tuple[str, bytes]]:
    """Yield ``(member_name, content)`` for every regular file in a tar archive on GCS.

    The object is downloaded into an in-memory buffer and the archive is read
    from that buffer, so nothing is extracted to disk.

    Args:
        uri: Location of the archive, in the form ``gs://<bucket>/<object>``.
        skip_meta: Regular expression; members whose name matches it (using
            ``re.search``) are skipped. The default skips names containing a
            ``__something__`` path component.
        handler: Called with any ``Exception`` raised while downloading or
            reading the archive. The default re-raises it. If the handler
            returns normally, iteration simply ends.

    Yields:
        Tuples of the member name inside the archive and its raw bytes.
    """
    try:
        # Strip the scheme only when it is a prefix, then split bucket from object.
        scheme = "gs://"
        path = uri[len(scheme):] if uri.startswith(scheme) else uri
        bucket_name, _, object_name = path.partition("/")
        if not bucket_name or not object_name:
            raise ValueError(f"Expected a URI of the form gs://<bucket>/<object>, got {uri!r}")

        client = storage.Client()
        blob = client.bucket(bucket_name).get_blob(object_name)
        if blob is None:
            # get_blob() returns None rather than raising when the object is missing.
            raise FileNotFoundError(f"No such object: {uri}")

        tar_file = io.BytesIO(blob.download_as_bytes())
        skip_pattern = re.compile(skip_meta)
        with tarfile.open(fileobj=tar_file) as tar:
            for tar_info in tar:
                if not tar_info.isfile() or skip_pattern.search(tar_info.name):
                    continue
                with tar.extractfile(tar_info) as f:
                    yield tar_info.name, f.read()
    except Exception as exc:
        handler(exc)
def load_images_from_iterator(iterator: Iterator[Tuple[str, bytes]], use_pil: bool = False, extensions: Tuple[str, ...] = (".jpg",)) -> "Dict[str, Union[np.ndarray, Image.Image]]":
    """Decode the image files produced by an iterator of ``(filename, content)`` pairs.

    Args:
        iterator: Yields ``(filename, raw_bytes)`` tuples, e.g. the output of
            ``tar_file_iterator``.
        use_pil: When true, decode with ``PIL.Image.open``; otherwise decode
            with ``cv2.imdecode`` using ``cv2.IMREAD_COLOR``.
        extensions: File-name suffixes (compared case-insensitively) that are
            treated as images. Files with any other suffix are skipped.

    Returns:
        A dict mapping each successfully decoded filename to its image. Files
        that fail to decode are reported with ``print`` and left out.
    """

    def _decode_with_pil(content):
        image = Image.open(io.BytesIO(content))
        # Image.open is lazy; force decoding now so corrupt data fails here.
        image.load()
        return image

    def _decode_with_cv2(content):
        image = cv2.imdecode(np.frombuffer(content, np.uint8), cv2.IMREAD_COLOR)
        if image is None:
            # imdecode signals failure by returning None instead of raising.
            raise ValueError("cv2.imdecode could not decode the data")
        return image

    load_image = _decode_with_pil if use_pil else _decode_with_cv2

    # Accept a single suffix passed as a plain string as well as a tuple.
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    image_dict = {}
    for filename, content in iterator:
        if not filename.lower().endswith(suffixes):
            continue
        try:
            image_dict[filename] = load_image(content)
        except Exception as e:
            print(f"Failed to load image from {filename}: {e}")
    return image_dict
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment