@7shi
Last active January 10, 2025 05:55
[py]Analyzes images using llama3.2-vision with Ollama, then translates the results using aya-expanse.
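# Usage sketch (the script filename below is an assumption; the gist does not name the file):
#   python describe.py photo.jpg "shots/*.png"
#   python describe.py photo.jpg --shrink 640 --translate Japanese --model aya-expanse
# Requires a local Ollama server with the llama3.2-vision model pulled, plus the
# translation model (e.g. aya-expanse) when --translate is used.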
import argparse, sys, glob, io
from PIL import Image
parser = argparse.ArgumentParser(description='Process an image with a language model.')
parser.add_argument('image_path', type=str, nargs='+', help='Path to the image file')
parser.add_argument('--translate', type=str, default='', help='Translate the response to the specified language (e.g. Japanese)')
parser.add_argument('--model', type=str, default='', help='Model to use (e.g. aya-expanse)')
parser.add_argument('--shrink', type=int, default=None, help='Maximum size of the image to shrink (e.g. 640)')
args = parser.parse_args()
if args.translate and not args.model:
    print("Error: --model is required when --translate is specified.", file=sys.stderr)
    sys.exit(1)
if args.shrink is not None and args.shrink < 16:
    print("Error: --shrink must be greater than or equal to 16.", file=sys.stderr)
    sys.exit(1)

# Expand glob patterns so wildcards also work on shells that do not expand them.
image_paths = []
for pattern in args.image_path:
    image_paths.extend(glob.glob(pattern))
import ollama
from datetime import timedelta

class Stats:
    """Timing and token statistics taken from the final chunk of an Ollama stream."""
    def __init__(self, last_chunk):
        # Durations are reported in nanoseconds; convert to seconds.
        self.total_duration = last_chunk.total_duration / 1e9
        self.load_duration = last_chunk.load_duration / 1e9
        self.prompt_eval_count = last_chunk.prompt_eval_count
        self.prompt_eval_duration = last_chunk.prompt_eval_duration / 1e9
        self.eval_count = last_chunk.eval_count
        self.eval_duration = last_chunk.eval_duration / 1e9

    def show(self):
        print("--- Statistics ---")
        print("total duration:      ", timedelta(seconds=self.total_duration))
        print("load duration:       ", timedelta(seconds=self.load_duration))
        print("prompt eval count:   ", self.prompt_eval_count)
        print("prompt eval duration:", timedelta(seconds=self.prompt_eval_duration))
        prompt_eval_rate = self.prompt_eval_count / self.prompt_eval_duration
        print("prompt eval rate:    ", f"{prompt_eval_rate:.2f} tokens/s")
        print("eval count:          ", self.eval_count)
        print("eval duration:       ", timedelta(seconds=self.eval_duration))
        eval_rate = self.eval_count / self.eval_duration
        print("eval rate:           ", f"{eval_rate:.2f} tokens/s")

def exec_stream(model, prompt, *images):
    """Send a prompt (optionally with images) to a model and stream the reply to stdout."""
    messages = [{"role": "user", "content": prompt}]
    if images:
        messages[0]["images"] = images
    stream = ollama.chat(model=model, messages=messages, stream=True)
    text = ""
    for chunk in stream:
        chunk_text = chunk['message']['content']
        text += chunk_text
        print(chunk_text, end='', flush=True)
    if not text.endswith("\n"):
        print()
    # The final chunk carries the timing and token statistics.
    return text.rstrip(), Stats(chunk)

def shrink_image(image_path, max_size):
    """Return the image as bytes, downscaled so its longer side is at most max_size."""
    if max_size is None:
        with open(image_path, "rb") as f:
            return f.read()
    image = Image.open(image_path)
    w1, h1 = image.size
    if max(w1, h1) > max_size:
        if w1 > h1:
            w2 = max_size
            h2 = int(h1 * (max_size / w1))
        else:
            w2 = int(w1 * (max_size / h1))
            h2 = max_size
        image = image.resize((w2, h2), resample=Image.LANCZOS)
    buf = io.BytesIO()
    image.save(buf, format="PNG")
    return buf.getvalue()

# Describe each image with the vision model.
texts = []
for i, image_path in enumerate(image_paths):
    if i:
        print()
    print(f"Image {i+1}/{len(image_paths)}: {image_path}")
    print()
    text, st = exec_stream(
        "llama3.2-vision",
        "What is in this image?",
        shrink_image(image_path, args.shrink)
    )
    texts.append(text)
    print()
    st.show()

# Optionally translate each description with the specified model.
if args.translate:
    for i, text in enumerate(texts):
        print()
        print(f"Image {i+1}/{len(image_paths)}: {image_paths[i]}")
        print()
        _, st = exec_stream(args.model, " ".join([
            "Output only the result, without any explanation.",
            "Keep the markdown.",
            f"Translate into {args.translate}:\n{text}"
        ]))
        print()
        st.show()