Last active
April 8, 2024 15:10
-
-
Save sborquez/d1e7d142b96789fc5f23bf9783559daa to your computer and use it in GitHub Desktop.
Load images from a tar file without uncompressing it
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io
import re
import tarfile
from typing import Any, Callable, Dict, Iterator, NoReturn, Tuple, Union

import cv2
import numpy as np
from google.cloud import storage
from PIL import Image
def reraise_exception(exc: BaseException) -> NoReturn:
    """Raise *exc* unchanged.

    Serves as the default error handler of ``tar_file_iterator``: any
    exception handed to it propagates to the caller instead of being
    swallowed.

    Args:
        exc: The exception instance to raise.

    Raises:
        BaseException: Always; the very object passed in as *exc*.
    """
    raise exc
def tar_file_iterator(uri: str, skip_meta: str = r"__[^/]*__($|/)", handler: Callable[[Exception], Any] = reraise_exception) -> Iterator[Tuple[str, bytes]]:
    """Yield the regular files stored in a tar archive hosted on Google Cloud Storage.

    The archive is downloaded into memory in one piece and read from there;
    nothing is extracted to disk.

    Args:
        uri: Location of the archive, ``gs://<bucket>/<object>``.
        skip_meta: Regular expression; members whose name matches it
            (via ``re.search``) are skipped.
        handler: Called with any ``Exception`` raised while downloading or
            reading the archive. The default re-raises it. If the handler
            returns, iteration simply ends.

    Yields:
        ``(member_name, member_bytes)`` for every regular file in the
        archive that is not filtered out by *skip_meta*.
    """
    try:
        # Split "gs://<bucket>/<object>" into bucket and object names. Only a
        # leading scheme is stripped, so object names are left untouched.
        path = uri[len("gs://"):] if uri.startswith("gs://") else uri
        bucket_name, _, object_name = path.partition("/")
        if not bucket_name or not object_name:
            raise ValueError(
                f"Expected a URI of the form gs://<bucket>/<object>, got {uri!r}"
            )

        # Compile once instead of re-resolving the pattern for every member.
        skip_pattern = re.compile(skip_meta)

        client = storage.Client()
        blob = client.bucket(bucket_name).get_blob(object_name)
        if blob is None:
            # get_blob() signals a missing object by returning None.
            raise FileNotFoundError(f"No such object: {uri}")

        archive = io.BytesIO(blob.download_as_bytes())
        with tarfile.open(fileobj=archive) as tar:
            # Iterating the TarFile reads headers lazily, avoiding the full
            # pre-scan that getmembers() performs.
            for tar_info in tar:
                if not tar_info.isfile() or skip_pattern.search(tar_info.name):
                    continue
                with tar.extractfile(tar_info) as f:
                    yield tar_info.name, f.read()
    except Exception as exc:
        handler(exc)
def load_images_from_iterator(iterator: Iterator[Tuple[str, bytes]], use_pil: bool = False) -> Dict[str, Union[np.ndarray, "Image.Image"]]:
    """Decode every ``.jpg`` entry produced by *iterator* into an image.

    Args:
        iterator: Yields ``(filename, raw_bytes)`` pairs, such as the output
            of ``tar_file_iterator``.
        use_pil: If true, open each image with ``PIL.Image.open``; otherwise
            decode it with ``cv2.imdecode`` using ``cv2.IMREAD_COLOR``.

    Returns:
        A dict mapping filename to the decoded image. Entries whose name does
        not end in ``.jpg`` (case-insensitive) are ignored. Entries that fail
        to decode are reported with ``print`` and left out of the result.
    """

    def _decode_with_pil(content: bytes) -> "Image.Image":
        # NOTE(review): Pillow's Image.open is documented as lazy, so pixel
        # data may only be decoded on first access -- confirm if eager
        # validation is required here.
        return Image.open(io.BytesIO(content))

    def _decode_with_cv2(content: bytes) -> np.ndarray:
        image = cv2.imdecode(np.frombuffer(content, np.uint8), cv2.IMREAD_COLOR)
        if image is None:
            # imdecode reports undecodable data by returning None rather than
            # raising; turn that into an error so it is reported and skipped.
            raise ValueError("cv2.imdecode could not decode the data")
        return image

    load_image = _decode_with_pil if use_pil else _decode_with_cv2

    image_dict = {}
    for filename, content in iterator:
        # Only JPG files are treated as images (extension check only).
        if not filename.lower().endswith(".jpg"):
            continue
        try:
            image_dict[filename] = load_image(content)
        except Exception as e:
            # Best effort: one bad image must not abort the whole batch.
            print(f"Failed to load image from {filename}: {e}")
    return image_dict
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment