Last active
January 10, 2025 05:55
-
-
Save 7shi/64c99dda8c9b413a92284be2960a47e2 to your computer and use it in GitHub Desktop.
[py]Analyzes images using llama3.2-vision with Ollama, then translates the results using aya-expanse.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse, sys, glob, io

from PIL import Image

# --- Command-line interface -------------------------------------------------
parser = argparse.ArgumentParser(description='Process an image with a language model.')
parser.add_argument('image_path', type=str, nargs='+', help='Path to the image file')
parser.add_argument('--translate', type=str, default='', help='Translate the response to the specified language (e.g. Japanese)')
parser.add_argument('--model', type=str, default='', help='Model to use (e.g. aya-expanse)')
parser.add_argument('--shrink', type=int, default=None, help='Maximum size of the image to shrink (e.g. 640)')
args = parser.parse_args()

# --translate needs a text model to do the translation with.
if args.translate and not args.model:
    print("Error: --model is required when --translate is specified.", file=sys.stderr)
    sys.exit(1)
# Reject degenerate shrink targets that would render the image useless.
if args.shrink is not None and args.shrink < 16:
    print("Error: --shrink must be greater than or equal to 16.", file=sys.stderr)
    sys.exit(1)

# Expand shell-style glob patterns ourselves so wildcards also work on
# shells (e.g. Windows cmd) that do not expand them.
image_paths = []
for pattern in args.image_path:
    image_paths.extend(glob.glob(pattern))
# Previously the script silently did nothing when nothing matched;
# fail loudly instead so typos in paths are noticed.
if not image_paths:
    print("Error: no image files matched the given path(s).", file=sys.stderr)
    sys.exit(1)
import ollama | |
from datetime import timedelta | |
class Stats:
    """Timing statistics extracted from the final chunk of an Ollama stream.

    Ollama reports all durations in nanoseconds; they are converted to
    seconds at construction time so they can be rendered with ``timedelta``.
    """

    def __init__(self, last_chunk):
        # Only the final streamed chunk carries the aggregate counters.
        self.total_duration = last_chunk.total_duration / 1e9
        self.load_duration = last_chunk.load_duration / 1e9
        self.prompt_eval_count = last_chunk.prompt_eval_count
        self.prompt_eval_duration = last_chunk.prompt_eval_duration / 1e9
        self.eval_count = last_chunk.eval_count
        self.eval_duration = last_chunk.eval_duration / 1e9

    def show(self):
        """Print a human-readable summary of the collected statistics."""
        print("--- Statistics ---")
        print("total duration:      ", timedelta(seconds=self.total_duration))
        print("load duration:       ", timedelta(seconds=self.load_duration))
        print("prompt eval count:   ", self.prompt_eval_count)
        print("prompt eval duration:", timedelta(seconds=self.prompt_eval_duration))
        # Guard against zero durations (possible e.g. with a fully cached
        # prompt) so we report a 0 rate instead of raising ZeroDivisionError.
        prompt_eval_rate = (self.prompt_eval_count / self.prompt_eval_duration
                            if self.prompt_eval_duration else 0.0)
        print("prompt eval rate:    ", f"{prompt_eval_rate:.2f} tokens/s")
        # Label fixed for consistency with the other rows ("eval_count" ->
        # "eval count").
        print("eval count:          ", self.eval_count)
        print("eval duration:       ", timedelta(seconds=self.eval_duration))
        eval_rate = (self.eval_count / self.eval_duration
                     if self.eval_duration else 0.0)
        print("eval rate:           ", f"{eval_rate:.2f} tokens/s")
def exec_stream(model, prompt, *images):
    """Stream a chat completion from `model`, echoing tokens as they arrive.

    Any `images` are attached to the single user message. Returns a tuple of
    (full response text with trailing whitespace stripped, Stats built from
    the final chunk).
    """
    message = {"role": "user", "content": prompt}
    if images:
        message["images"] = images
    pieces = []
    for chunk in ollama.chat(model=model, messages=[message], stream=True):
        piece = chunk['message']['content']
        pieces.append(piece)
        print(piece, end='', flush=True)
    text = "".join(pieces)
    # Make sure the streamed output always ends with a newline on screen.
    if not text.endswith("\n"):
        print()
    return text.rstrip(), Stats(chunk)
def shrink_image(image_path, max_size):
    """Return the image file's bytes, downscaled when larger than max_size.

    If `max_size` is None the file's bytes are returned untouched. Otherwise
    the image is opened, proportionally resized so its longer side is at most
    `max_size`, and re-encoded as PNG (re-encoding happens even when no
    resize was needed, matching the original behavior).
    """
    if max_size is None:
        # Fast path: pass the original bytes through unmodified.
        with open(image_path, "rb") as f:
            return f.read()
    # Context manager ensures PIL's underlying file handle is closed
    # promptly (the previous version leaked it until garbage collection).
    with Image.open(image_path) as image:
        w1, h1 = image.size
        if max(w1, h1) > max_size:
            # Scale the longer edge down to max_size, preserving aspect ratio.
            if w1 > h1:
                w2 = max_size
                h2 = int(h1 * (max_size / w1))
            else:
                w2 = int(w1 * (max_size / h1))
                h2 = max_size
            image = image.resize((w2, h2), resample=Image.LANCZOS)
        buf = io.BytesIO()
        image.save(buf, format="PNG")
        return buf.getvalue()
# Describe every matched image with the vision model, remembering each
# description so it can optionally be translated afterwards.
texts = []
for i, image_path in enumerate(image_paths):
    if i:
        print()  # blank line between consecutive images
    print(f"Image {i+1}/{len(image_paths)}: {image_path}")
    print()
    # (The prompt was an f-string with no placeholders; a plain literal is
    # equivalent.)
    text, st = exec_stream(
        "llama3.2-vision",
        "What is in this image?",
        shrink_image(image_path, args.shrink)
    )
    texts.append(text)
    print()
    st.show()
# Optional second pass: translate every collected description with the
# user-specified text model.
if args.translate:
    # Instruction prefix is constant across images, so build it once.
    instruction_parts = [
        "Output only the result, without any explanation.",
        "Keep the markdown.",
    ]
    for i, (path, text) in enumerate(zip(image_paths, texts)):
        print()
        print(f"Image {i+1}/{len(image_paths)}: {path}")
        print()
        prompt = " ".join(instruction_parts + [f"Translate into {args.translate}:\n{text}"])
        _, st = exec_stream(args.model, prompt)
        print()
        st.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment