# homedirectory/multimodal.py

## multimodal.py
import gradio as gr
from PIL import Image
import torch

from transformers import AutoProcessor, AutoModelForCausalLM

# Hugging Face checkpoint used for both the processor and the model, so the
# two always come from the same source.
_CHECKPOINT = "microsoft/git-base-textvqa"
# Only the model weights are cached under this directory; the processor uses
# the library's default cache location.
_MODEL_CACHE_DIR = '/tmp'

processor = AutoProcessor.from_pretrained(_CHECKPOINT)
model = AutoModelForCausalLM.from_pretrained(_CHECKPOINT, cache_dir=_MODEL_CACHE_DIR)

def doit(image1, image2, question):
    """Answer a question about two images shown side by side.

    The two images are pasted next to each other on a single RGB canvas, the
    question is prefixed with a hint describing that layout, and the model
    generates text conditioned on the combined image and the prompt.

    Args:
        image1: PIL image placed on the left of the canvas.
        image2: PIL image placed on the right of the canvas.
        question: Question text entered by the user.

    Returns:
        The decoded output sequences joined with newlines.

    Raises:
        gr.Error: If either image is missing.
    """
    # The UI passes None for an image slot that was left empty; fail with a
    # readable message instead of an AttributeError on None.
    if image1 is None or image2 is None:
        raise gr.Error("Please provide both images before asking a question.")

    # Size the canvas to the taller image so neither picture is cropped; the
    # area not covered by the shorter image keeps Image.new's default fill.
    canvas_size = (image1.width + image2.width, max(image1.height, image2.height))
    image = Image.new('RGB', canvas_size)
    image.paste(image1, (0, 0))
    image.paste(image2, (image1.width, 0))

    # Tell the model about the layout before asking the actual question.
    prompt = "Given these 2 images side-by-side, " + (question or "")

    pixel_values = processor(images=image, return_tensors="pt").pixel_values
    # Tokenize without automatic special tokens, then prepend the CLS token
    # by hand and add a leading batch dimension of size 1.
    prompt_ids = processor(text=prompt, add_special_tokens=False).input_ids
    input_ids = torch.tensor([processor.tokenizer.cls_token_id] + prompt_ids).unsqueeze(0)

    generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=500)
    answer = processor.batch_decode(generated_ids, skip_special_tokens=True)

    return "\n".join(answer)

# Web UI: two identical image pickers (upload or webcam, delivered as PIL
# images) followed by a question box; the text returned by `doit` is shown
# in a label titled "Answer".
gradio_app = gr.Interface(
    fn=doit,
    inputs=[
        *(
            gr.Image(label="Select an image", sources=["upload", "webcam"], type="pil")
            for _ in range(2)
        ),
        gr.Textbox(placeholder="Enter a question"),
    ],
    outputs=[gr.Label(label="Answer")],
    title="Question about 2 images",
)

# Start the Gradio server only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    gradio_app.launch()