# sborquez/tar_image_iterator.py

## tar_image_iterator.py
import io
import re
import tarfile
from typing import Any, Callable, Dict, Iterator, Tuple, Union

import cv2
from google.cloud import storage
import numpy as np
from PIL import Image


def reraise_exception(exc: Any) -> None:
    """Raise the exception that was passed in.

    This is the default ``handler`` of ``tar_file_iterator``: it makes any
    error hit while reading the archive propagate to the caller instead of
    being swallowed.

    Args:
        exc: The exception to raise.

    Raises:
        The given ``exc``; this function never returns normally.
    """
    raise exc


def tar_file_iterator(
    uri: str,
    skip_meta: str = r"__[^/]*__($|/)",
    handler: Callable[[Exception], Any] = reraise_exception,
) -> Iterator[Tuple[str, bytes]]:
    """Yield ``(member_name, content)`` for every regular file in a tar archive on GCS.

    The object is downloaded fully into memory and then read with ``tarfile``.

    Args:
        uri: Location of the archive as ``gs://<bucket>/<object>``. The
            ``gs://`` prefix is optional.
        skip_meta: Regular expression applied with ``re.search`` to each
            member name; matching members are skipped. The default skips
            paths containing a ``__name__`` component.
        handler: Called with any ``Exception`` raised while parsing the URI,
            downloading, or reading the archive. The default re-raises it.
            If the handler returns, iteration simply ends.

    Yields:
        Tuples of the member's path inside the archive and its raw bytes.

    Raises:
        Whatever ``handler`` raises. With the default handler: ``ValueError``
        for a malformed URI, ``FileNotFoundError`` when the object does not
        exist, and any error from the storage client or ``tarfile``.
    """
    try:
        # Strip the scheme only when it is a prefix, then split on the first
        # slash: bucket names cannot contain "/", object names can.
        scheme = "gs://"
        path = uri[len(scheme):] if uri.startswith(scheme) else uri
        bucket_name, _, object_name = path.partition("/")
        if not bucket_name or not object_name:
            raise ValueError(
                f"Expected a URI of the form gs://<bucket>/<object>, got {uri!r}"
            )

        # get_blob returns None (rather than raising) for a missing object.
        blob = storage.Client().bucket(bucket_name).get_blob(object_name)
        if blob is None:
            raise FileNotFoundError(f"No such object: {uri!r}")

        skip_pattern = re.compile(skip_meta)
        archive = io.BytesIO(blob.download_as_bytes())
        with tarfile.open(fileobj=archive) as tar:
            for tar_info in tar:
                if not tar_info.isfile() or skip_pattern.search(tar_info.name):
                    continue
                with tar.extractfile(tar_info) as member:
                    yield tar_info.name, member.read()
    except Exception as exc:
        handler(exc)


def _decode_with_pil(content: bytes) -> "Image.Image":
    """Decode image bytes into a fully loaded PIL image."""
    image = Image.open(io.BytesIO(content))
    # Image.open is lazy; force decoding now so corrupt data fails here,
    # inside the caller's error handling, rather than at first use.
    image.load()
    return image


def _decode_with_cv2(content: bytes) -> np.ndarray:
    """Decode image bytes into a colour array with OpenCV."""
    image = cv2.imdecode(np.frombuffer(content, np.uint8), cv2.IMREAD_COLOR)
    # imdecode signals undecodable data by returning None instead of raising.
    if image is None:
        raise ValueError("cv2.imdecode could not decode the data")
    return image


def load_images_from_iterator(
    iterator: Iterator[Tuple[str, bytes]],
    use_pil: bool = False,
    extensions: Tuple[str, ...] = (".jpg",),
) -> "Dict[str, Union[np.ndarray, Image.Image]]":
    """Decode the image files produced by an iterator of ``(filename, bytes)``.

    Files whose name does not end with one of ``extensions`` are ignored.
    Files that fail to decode are reported on stdout and left out of the
    result; they do not abort the loop.

    Args:
        iterator: Yields ``(filename, content)`` pairs, e.g. the output of
            ``tar_file_iterator``.
        use_pil: Decode with Pillow when true, otherwise with OpenCV
            (``cv2.IMREAD_COLOR``).
        extensions: Case-insensitive filename suffixes to accept. Defaults
            to ``(".jpg",)``.

    Returns:
        A dict mapping each successfully decoded filename to its image: a
        PIL image when ``use_pil`` is true, otherwise a ``numpy`` array.
    """
    decode = _decode_with_pil if use_pil else _decode_with_cv2
    suffixes = tuple(ext.lower() for ext in extensions)

    image_dict = {}
    for filename, content in iterator:
        if not filename.lower().endswith(suffixes):
            continue
        try:
            image_dict[filename] = decode(content)
        except Exception as e:
            # Best effort: one bad image must not discard the rest.
            print(f"Failed to load image from {filename}: {str(e)}")
    return image_dict
	import io
	from typing import Any, Callable, Iterator, Tuple, Union

	import cv2
	from google.cloud import storage
	import numpy as np
	from PIL import Image
	import tarfile


	def reraise_exception(exc: Any) -> None:
	    """Raise the exception that was passed in.

	    This is the default ``handler`` of ``tar_file_iterator``: it makes any
	    error hit while reading the archive propagate to the caller instead of
	    being swallowed.

	    Args:
	        exc: The exception to raise.

	    Raises:
	        The given ``exc``; this function never returns normally.
	    """
	    raise exc


	def tar_file_iterator(
	    uri: str,
	    skip_meta: str = r"__[^/]*__($|/)",
	    handler: Callable[[Exception], Any] = reraise_exception,
	) -> Iterator[Tuple[str, bytes]]:
	    """Yield ``(member_name, content)`` for every regular file in a tar archive on GCS.

	    The object is downloaded fully into memory and then read with ``tarfile``.

	    Args:
	        uri: Location of the archive as ``gs://<bucket>/<object>``. The
	            ``gs://`` prefix is optional.
	        skip_meta: Regular expression applied with ``re.search`` to each
	            member name; matching members are skipped. The default skips
	            paths containing a ``__name__`` component.
	        handler: Called with any ``Exception`` raised while parsing the URI,
	            downloading, or reading the archive. The default re-raises it.
	            If the handler returns, iteration simply ends.

	    Yields:
	        Tuples of the member's path inside the archive and its raw bytes.

	    Raises:
	        Whatever ``handler`` raises. With the default handler: ``ValueError``
	        for a malformed URI, ``FileNotFoundError`` when the object does not
	        exist, and any error from the storage client or ``tarfile``.
	    """
	    try:
	        # Strip the scheme only when it is a prefix, then split on the first
	        # slash: bucket names cannot contain "/", object names can.
	        scheme = "gs://"
	        path = uri[len(scheme):] if uri.startswith(scheme) else uri
	        bucket_name, _, object_name = path.partition("/")
	        if not bucket_name or not object_name:
	            raise ValueError(
	                f"Expected a URI of the form gs://<bucket>/<object>, got {uri!r}"
	            )

	        # get_blob returns None (rather than raising) for a missing object.
	        blob = storage.Client().bucket(bucket_name).get_blob(object_name)
	        if blob is None:
	            raise FileNotFoundError(f"No such object: {uri!r}")

	        skip_pattern = re.compile(skip_meta)
	        archive = io.BytesIO(blob.download_as_bytes())
	        with tarfile.open(fileobj=archive) as tar:
	            for tar_info in tar:
	                if not tar_info.isfile() or skip_pattern.search(tar_info.name):
	                    continue
	                with tar.extractfile(tar_info) as member:
	                    yield tar_info.name, member.read()
	    except Exception as exc:
	        handler(exc)


	def _decode_with_pil(content: bytes) -> "Image.Image":
	    """Decode image bytes into a fully loaded PIL image."""
	    image = Image.open(io.BytesIO(content))
	    # Image.open is lazy; force decoding now so corrupt data fails here,
	    # inside the caller's error handling, rather than at first use.
	    image.load()
	    return image


	def _decode_with_cv2(content: bytes) -> np.ndarray:
	    """Decode image bytes into a colour array with OpenCV."""
	    image = cv2.imdecode(np.frombuffer(content, np.uint8), cv2.IMREAD_COLOR)
	    # imdecode signals undecodable data by returning None instead of raising.
	    if image is None:
	        raise ValueError("cv2.imdecode could not decode the data")
	    return image


	def load_images_from_iterator(
	    iterator: Iterator[Tuple[str, bytes]],
	    use_pil: bool = False,
	    extensions: Tuple[str, ...] = (".jpg",),
	) -> "Dict[str, Union[np.ndarray, Image.Image]]":
	    """Decode the image files produced by an iterator of ``(filename, bytes)``.

	    Files whose name does not end with one of ``extensions`` are ignored.
	    Files that fail to decode are reported on stdout and left out of the
	    result; they do not abort the loop.

	    Args:
	        iterator: Yields ``(filename, content)`` pairs, e.g. the output of
	            ``tar_file_iterator``.
	        use_pil: Decode with Pillow when true, otherwise with OpenCV
	            (``cv2.IMREAD_COLOR``).
	        extensions: Case-insensitive filename suffixes to accept. Defaults
	            to ``(".jpg",)``.

	    Returns:
	        A dict mapping each successfully decoded filename to its image: a
	        PIL image when ``use_pil`` is true, otherwise a ``numpy`` array.
	    """
	    decode = _decode_with_pil if use_pil else _decode_with_cv2
	    suffixes = tuple(ext.lower() for ext in extensions)

	    image_dict = {}
	    for filename, content in iterator:
	        if not filename.lower().endswith(suffixes):
	            continue
	        try:
	            image_dict[filename] = decode(content)
	        except Exception as e:
	            # Best effort: one bad image must not discard the rest.
	            print(f"Failed to load image from {filename}: {str(e)}")
	    return image_dict