Skip to content

Instantly share code, notes, and snippets.

@homedirectory
Created March 30, 2024 12:00
Show Gist options
  • Save homedirectory/2047fecca2d44a19688c950875ca94f9 to your computer and use it in GitHub Desktop.
Save homedirectory/2047fecca2d44a19688c950875ca94f9 to your computer and use it in GitHub Desktop.
Example of multimodal models from Hugging Face
import gradio as gr
from PIL import Image
import torch
from transformers import AutoProcessor, AutoModelForCausalLM
# Hugging Face checkpoint name shared by the processor and the model.
_CHECKPOINT = "microsoft/git-base-textvqa"

# The processor is loaded without an explicit cache directory.
processor = AutoProcessor.from_pretrained(_CHECKPOINT)

# The model is loaded with /tmp passed as its cache directory.
model = AutoModelForCausalLM.from_pretrained(
    _CHECKPOINT,
    cache_dir="/tmp",
)
def doit(image1, image2, question):
    """Answer a question about two images shown side by side.

    The two images are pasted next to each other on a single RGB canvas,
    the question is prefixed with a short hint describing that layout, and
    the module-level ``processor``/``model`` pair generates the answer.

    Args:
        image1: Image placed on the left; must expose ``width``/``height``
            and be accepted by ``Image.paste`` (e.g. a PIL image).
        image2: Image placed on the right; same requirements as ``image1``.
        question: Question text. ``None`` is treated as an empty string.

    Returns:
        The decoded generated sequences joined with newlines.

    Raises:
        gr.Error: If either image is missing.
    """
    if image1 is None or image2 is None:
        # Surface a readable message in the UI instead of an AttributeError.
        raise gr.Error("Please provide both images.")

    # Concatenate the images side by side. The canvas is as tall as the
    # taller image so that neither one gets cropped when heights differ.
    canvas_size = (
        image1.width + image2.width,
        max(image1.height, image2.height),
    )
    image = Image.new('RGB', canvas_size)
    image.paste(image1, (0, 0))
    image.paste(image2, (image1.width, 0))

    # Prepare the question.
    prompt = "Given these 2 images side-by-side, " + (question or "")

    # Encode the image and the prompt. The prompt is tokenized without
    # special tokens and the CLS token id is prepended by hand.
    pixel_values = processor(images=image, return_tensors="pt").pixel_values
    token_ids = processor(text=prompt, add_special_tokens=False).input_ids
    token_ids = [processor.tokenizer.cls_token_id] + token_ids
    input_ids = torch.tensor(token_ids).unsqueeze(0)  # add a batch dimension

    generated_ids = model.generate(
        pixel_values=pixel_values,
        input_ids=input_ids,
        max_length=500,
    )
    answer = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return "\n".join(answer)
def _image_input():
    """Build one image input that accepts an upload or a webcam capture."""
    return gr.Image(
        label="Select an image",
        sources=["upload", "webcam"],
        type="pil",
    )


# Web UI: two image inputs plus a question box feed `doit`; the returned
# text is shown in a single "Answer" label.
gradio_app = gr.Interface(
    fn=doit,
    inputs=[
        _image_input(),
        _image_input(),
        gr.Textbox(placeholder="Enter a question"),
    ],
    outputs=[gr.Label(label="Answer")],
    title="Question about 2 images",
)

if __name__ == "__main__":
    # Start the Gradio server only when run as a script.
    gradio_app.launch()
@homedirectory
Copy link
Author

image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment