Skip to content

Instantly share code, notes, and snippets.

@simonw
Created August 27, 2024 21:33
Show Gist options
  • Save simonw/d77ab3ef497ed79f353633322cc6d38a to your computer and use it in GitHub Desktop.
Save simonw/d77ab3ef497ed79f353633322cc6d38a to your computer and use it in GitHub Desktop.

Gemini API Image Bounding Box Visualization Tool

human (Aug 27, 2024, 01:51 PM)

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Gemini API Image Bounding Box Visualization</title>
    <script type="module">
        import { GoogleGenerativeAI } from "https://esm.run/@google/generative-ai";
        import { marked } from "https://esm.run/marked";

        /**
         * Returns the Gemini API key, asking the user for it on first use.
         *
         * A key already saved in localStorage under "GEMINI_API_KEY" is returned
         * as-is. Otherwise the user is prompted, and a non-empty answer is saved
         * so later calls do not prompt again.
         *
         * @returns {string|null} The stored or entered key; whatever prompt()
         *   returned (null or "") when the user supplied nothing.
         */
        function getApiKey() {
            const savedKey = localStorage.getItem("GEMINI_API_KEY");
            if (savedKey) {
                return savedKey;
            }

            const enteredKey = prompt("Please enter your Gemini API key:");
            if (enteredKey) {
                localStorage.setItem("GEMINI_API_KEY", enteredKey);
            }
            return enteredKey;
        }

        /**
         * Builds a Gemini model client using the user's API key.
         *
         * @param {object} params - Model parameters forwarded unchanged to
         *   GoogleGenerativeAI#getGenerativeModel (e.g. { model: "gemini-1.5-pro" }).
         * @returns {Promise<object>} Resolves to the model object returned by the SDK.
         * @throws {Error} If getApiKey() yields no key (for example, the key
         *   prompt was dismissed), instead of constructing a client without one.
         */
        async function getGenerativeModel(params) {
            const apiKey = getApiKey();
            if (!apiKey) {
                throw new Error("A Gemini API key is required.");
            }
            return new GoogleGenerativeAI(apiKey).getGenerativeModel(params);
        }

        /**
         * Converts a File/Blob into the inline-data part passed to generateContent.
         *
         * The file is read as a data URL and everything up to and including the
         * first comma (the "data:...;base64," prefix) is dropped.
         *
         * @param {Blob} file - The file to encode; its `type` becomes the MIME type.
         * @returns {Promise<{inlineData: {data: string, mimeType: string}}>}
         *   Resolves to the base64 payload plus MIME type. Rejects if the file
         *   cannot be read.
         */
        async function fileToGenerativePart(file) {
            return new Promise((resolve, reject) => {
                const reader = new FileReader();
                // onload fires only after a successful read, so reader.result
                // is a data URL string here (it is null after a failed read).
                reader.onload = () => resolve({
                    inlineData: {
                        data: reader.result.split(",")[1],
                        mimeType: file.type,
                    },
                });
                // Without a rejection path a failed read would leave the
                // promise pending forever.
                reader.onerror = () => reject(reader.error || new Error("Failed to read the file."));
                reader.readAsDataURL(file);
            });
        }

        /**
         * Downscales an image file and re-encodes it as a JPEG.
         *
         * Images wider than 1000px are scaled to 1000px wide with the height
         * scaled proportionally; narrower images keep their dimensions. The
         * result is always encoded as JPEG at quality 0.7.
         *
         * @param {Blob} file - The image file to process.
         * @returns {Promise<File>} Resolves to a File named "compressed_image.jpg"
         *   of type "image/jpeg". Rejects if the file cannot be read, cannot be
         *   decoded as an image, or cannot be encoded.
         */
        function resizeAndCompressImage(file) {
            const MAX_WIDTH = 1000;
            const JPEG_QUALITY = 0.7;

            return new Promise((resolve, reject) => {
                const reader = new FileReader();
                reader.onerror = () => reject(reader.error || new Error("Failed to read the image file."));
                reader.onload = (event) => {
                    const img = new Image();
                    // A non-image or corrupt file never fires onload; reject
                    // so the caller is not left waiting.
                    img.onerror = () => reject(new Error("The selected file could not be decoded as an image."));
                    img.onload = () => {
                        let width = img.width;
                        let height = img.height;
                        if (width > MAX_WIDTH) {
                            height = Math.round((height * MAX_WIDTH) / width);
                            width = MAX_WIDTH;
                        }

                        const canvas = document.createElement('canvas');
                        canvas.width = width;
                        canvas.height = height;
                        canvas.getContext('2d').drawImage(img, 0, 0, width, height);

                        canvas.toBlob((blob) => {
                            // toBlob passes null when the image cannot be created.
                            if (!blob) {
                                reject(new Error("Failed to encode the image as JPEG."));
                                return;
                            }
                            resolve(new File([blob], "compressed_image.jpg", { type: "image/jpeg" }));
                        }, 'image/jpeg', JPEG_QUALITY);
                    };
                    img.src = event.target.result;
                };
                reader.readAsDataURL(file);
            });
        }

        /**
         * Click handler: sends the selected image and the prompt to Gemini and
         * renders the answer.
         *
         * Requires both an image and a non-empty prompt. The image is downscaled
         * and JPEG-compressed before upload. The response text is rendered as
         * Markdown into #result, and any [a, b, c, d] integer arrays found in it
         * are drawn as bounding boxes over the compressed image.
         *
         * Failures are reported as plain text in #result; nothing is thrown.
         *
         * @returns {Promise<void>}
         */
        async function processImageAndPrompt() {
            const fileInput = document.getElementById('imageInput');
            const promptInput = document.getElementById('promptInput');
            const resultDiv = document.getElementById('result');

            // Read the selection once so every later step works on the same file.
            const imageFile = fileInput.files[0];
            if (!imageFile || !promptInput.value) {
                alert('Please select an image and enter a prompt.');
                return;
            }

            resultDiv.textContent = 'Processing...';

            try {
                const model = await getGenerativeModel({ model: "gemini-1.5-pro" });
                const compressedImage = await resizeAndCompressImage(imageFile);
                const imagePart = await fileToGenerativePart(compressedImage);

                const result = await model.generateContent([promptInput.value, imagePart]);
                const response = await result.response;
                const text = response.text();

                // NOTE(review): the Markdown renderer's output is inserted as HTML
                // without sanitization; model output containing raw HTML would be
                // rendered as-is. Consider adding an HTML sanitizer here.
                resultDiv.innerHTML = marked.parse(text);

                // Extract coordinates from the response
                const coordinates = extractCoordinates(text);
                if (coordinates.length > 0) {
                    displayImageWithBoundingBoxes(compressedImage, coordinates);
                }
            } catch (error) {
                // textContent, not innerHTML: error messages are not trusted markup.
                resultDiv.textContent = `Error: ${error.message}`;
            }
        }

        /**
         * Finds every bracketed list of exactly four non-negative integers in
         * the text, e.g. "[10, 20, 300, 400]".
         *
         * Digit runs are parsed as base-10 integers, so zero-padded values such
         * as "010" are accepted (JSON parsing would reject them with a
         * SyntaxError even though the pattern matches them).
         *
         * @param {string} text - Text to scan.
         * @returns {number[][]} One four-element array per match, in order of
         *   appearance; an empty array when there are no matches.
         */
        function extractCoordinates(text) {
            const boxPattern = /\[\s*\d+\s*,\s*\d+\s*,\s*\d+\s*,\s*\d+\s*\]/g;
            const boxes = text.match(boxPattern) || [];
            // Each matched box contains exactly four digit runs by construction.
            return boxes.map((box) =>
                box.match(/\d+/g).map((digits) => Number.parseInt(digits, 10)));
        }

        /**
         * Draws the image on #canvas with a 0-1000 reference grid, labelled axes
         * and one coloured rectangle per bounding box.
         *
         * The canvas is sized 100px larger than the image in each dimension and
         * the image is drawn at offset (80, 20), leaving room for the axis labels
         * on the left and bottom. All drawing happens asynchronously, after the
         * file has been read and the image decoded; the function itself returns
         * immediately.
         *
         * @param {Blob} file - Image file to display.
         * @param {number[][]} coordinates - Boxes as [ymin, xmin, ymax, xmax],
         *   each value on a 0-1000 scale relative to the image size.
         * @returns {void}
         */
        function displayImageWithBoundingBoxes(file, coordinates) {
            const reader = new FileReader();
            reader.onload = function(event) {
                const image = new Image();
                image.onload = function() {
                    const canvas = document.getElementById('canvas');
                    // Assigning width/height also resets the canvas, wiping any earlier drawing.
                    canvas.width = image.width + 100;
                    canvas.height = image.height + 100;
                    const ctx = canvas.getContext('2d');
                    
                    // Draw the image
                    ctx.drawImage(image, 80, 20);

                    // Draw grid lines
                    ctx.strokeStyle = 'rgba(255, 0, 0, 0.5)';  // Red with 50% opacity
                    ctx.lineWidth = 1;

                    // Vertical grid lines
                    for (let i = 0; i <= 1000; i += 100) {
                        const x = 80 + i / 1000 * image.width;
                        ctx.beginPath();
                        ctx.moveTo(x, 20);
                        ctx.lineTo(x, image.height + 20);
                        ctx.stroke();
                    }

                    // Horizontal grid lines
                    // ((1000 - i) walks the same eleven positions as i, just bottom-up)
                    for (let i = 0; i <= 1000; i += 100) {
                        const y = 20 + (1000 - i) / 1000 * image.height;
                        ctx.beginPath();
                        ctx.moveTo(80, y);
                        ctx.lineTo(image.width + 80, y);
                        ctx.stroke();
                    }

                    // Draw bounding boxes
                    // Colours cycle when there are more boxes than palette entries.
                    const colors = ['#FF0000', '#00FF00', '#0000FF', '#FFFF00', '#FF00FF', '#00FFFF'];
                    coordinates.forEach((box, index) => {
                        // Convert from the 0-1000 scale to fractions of the image size.
                        const [ymin, xmin, ymax, xmax] = box.map(coord => coord / 1000);
                        
                        const width = (xmax - xmin) * image.width;
                        const height = (ymax - ymin) * image.height;

                        ctx.strokeStyle = colors[index % colors.length];
                        ctx.lineWidth = 5;
                        // +80 / +20 shift the box by the same offset the image was drawn at.
                        ctx.strokeRect(xmin * image.width + 80, ymin * image.height + 20, width, height);
                    });

                    // Draw axes and labels
                    ctx.strokeStyle = '#000000';
                    ctx.lineWidth = 1;
                    ctx.font = '26px Arial';
                    ctx.textAlign = 'right';

                    // Y-axis
                    ctx.beginPath();
                    ctx.moveTo(80, 20);
                    ctx.lineTo(80, image.height + 20);
                    ctx.stroke();

                    // Y-axis labels and ticks
                    // (0 is at the top of the image, 1000 at the bottom)
                    for (let i = 0; i <= 1000; i += 100) {
                        const y = 20 + i / 1000 * image.height;
                        ctx.fillText(i.toString(), 75, y + 5);
                        ctx.beginPath();
                        ctx.moveTo(75, y);
                        ctx.lineTo(80, y);
                        ctx.stroke();
                    }

                    // X-axis
                    ctx.beginPath();
                    ctx.moveTo(80, image.height + 20);
                    ctx.lineTo(image.width + 80, image.height + 20);
                    ctx.stroke();

                    // X-axis labels and ticks
                    ctx.textAlign = 'center';
                    for (let i = 0; i <= 1000; i += 100) {
                        const x = 80 + i / 1000 * image.width;
                        ctx.fillText(i.toString(), x, image.height + 40);
                        ctx.beginPath();
                        ctx.moveTo(x, image.height + 20);
                        ctx.lineTo(x, image.height + 25);
                        ctx.stroke();
                    }
                };
                // Setting src starts decoding; image.onload above runs once it is ready.
                image.src = event.target.result;
            };
            reader.readAsDataURL(file);
        }

        // Attach event listener to the submit button
        // (this code lives in a <script type="module">, which browsers defer until the
        // document has been parsed, so #submitBtn exists even though the script is in <head>)
        document.getElementById('submitBtn').addEventListener('click', processImageAndPrompt);
    </script>
    <style>
        body {
            font-family: Arial, sans-serif;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
        }
        textarea {
            width: 100%;
            height: 100px;
        }
        #result, #imageContainer {
            margin-top: 20px;
            border: 1px solid #ccc;
            padding: 10px;
        }
        #canvas {
            max-width: 100%;
            height: auto;
        }
    </style>
</head>
<body>
    <h1>Gemini API Image Bounding Box Visualization</h1>
    <input type="file" id="imageInput" accept="image/*">
    <textarea id="promptInput">Return bounding boxes as JSON arrays [ymin, xmin, ymax, xmax]
</textarea>
    <button id="submitBtn">Process</button>
    <div id="result"></div>
    <div id="imageContainer">
        <canvas id="canvas"></canvas>
    </div>
</body>
</html>

Add the ability to select from a list of models - keep the current one as the default and add options for gemini-1.5-flash-8b-exp-0827 and gemini-1.5-pro-exp-0827 and gemini-1.5-flash-exp-0827

Make sure image upload is optional, if no image set then it runs the prompt without the image

Add a clear image link

When an image is selected, it should be shown in the area at the bottom.

assistant (Aug 27, 2024, 01:51 PM)

Certainly, I'll modify the HTML to include these new features. Here's the updated version:

Gemini API Image Bounding Box Visualizer

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Gemini API Image Bounding Box Visualization</title>
    <script type="module">
        import { GoogleGenerativeAI } from "https://esm.run/@google/generative-ai";
        import { marked } from "https://esm.run/marked";

        /**
         * Looks up the Gemini API key, prompting the user when none is saved.
         *
         * The key is cached in localStorage under "GEMINI_API_KEY"; a non-empty
         * answer to the prompt is written there so later calls skip the prompt.
         *
         * @returns {string|null} The saved or newly entered key, or the raw
         *   prompt() result (null or "") when nothing was provided.
         */
        function getApiKey() {
            const cachedKey = localStorage.getItem("GEMINI_API_KEY");
            if (cachedKey) {
                return cachedKey;
            }

            const typedKey = prompt("Please enter your Gemini API key:");
            if (typedKey) {
                localStorage.setItem("GEMINI_API_KEY", typedKey);
            }
            return typedKey;
        }

        /**
         * Builds a Gemini model client using the user's API key.
         *
         * @param {object} params - Model parameters forwarded unchanged to
         *   GoogleGenerativeAI#getGenerativeModel (e.g. { model: "gemini-1.5-pro" }).
         * @returns {Promise<object>} Resolves to the model object returned by the SDK.
         * @throws {Error} If getApiKey() yields no key (for example, the key
         *   prompt was dismissed), instead of constructing a client without one.
         */
        async function getGenerativeModel(params) {
            const apiKey = getApiKey();
            if (!apiKey) {
                throw new Error("A Gemini API key is required.");
            }
            return new GoogleGenerativeAI(apiKey).getGenerativeModel(params);
        }

        /**
         * Converts a File/Blob into the inline-data part passed to generateContent.
         *
         * The file is read as a data URL and everything up to and including the
         * first comma (the "data:...;base64," prefix) is dropped.
         *
         * @param {Blob} file - The file to encode; its `type` becomes the MIME type.
         * @returns {Promise<{inlineData: {data: string, mimeType: string}}>}
         *   Resolves to the base64 payload plus MIME type. Rejects if the file
         *   cannot be read.
         */
        async function fileToGenerativePart(file) {
            return new Promise((resolve, reject) => {
                const reader = new FileReader();
                // onload fires only after a successful read, so reader.result
                // is a data URL string here (it is null after a failed read).
                reader.onload = () => resolve({
                    inlineData: {
                        data: reader.result.split(",")[1],
                        mimeType: file.type,
                    },
                });
                // Without a rejection path a failed read would leave the
                // promise pending forever.
                reader.onerror = () => reject(reader.error || new Error("Failed to read the file."));
                reader.readAsDataURL(file);
            });
        }

        /**
         * Downscales an image file and re-encodes it as a JPEG.
         *
         * Images wider than 1000px are scaled to 1000px wide with the height
         * scaled proportionally; narrower images keep their dimensions. The
         * result is always encoded as JPEG at quality 0.7.
         *
         * @param {Blob} file - The image file to process.
         * @returns {Promise<File>} Resolves to a File named "compressed_image.jpg"
         *   of type "image/jpeg". Rejects if the file cannot be read, cannot be
         *   decoded as an image, or cannot be encoded.
         */
        function resizeAndCompressImage(file) {
            const MAX_WIDTH = 1000;
            const JPEG_QUALITY = 0.7;

            return new Promise((resolve, reject) => {
                const reader = new FileReader();
                reader.onerror = () => reject(reader.error || new Error("Failed to read the image file."));
                reader.onload = (event) => {
                    const img = new Image();
                    // A non-image or corrupt file never fires onload; reject
                    // so the caller is not left waiting.
                    img.onerror = () => reject(new Error("The selected file could not be decoded as an image."));
                    img.onload = () => {
                        let width = img.width;
                        let height = img.height;
                        if (width > MAX_WIDTH) {
                            height = Math.round((height * MAX_WIDTH) / width);
                            width = MAX_WIDTH;
                        }

                        const canvas = document.createElement('canvas');
                        canvas.width = width;
                        canvas.height = height;
                        canvas.getContext('2d').drawImage(img, 0, 0, width, height);

                        canvas.toBlob((blob) => {
                            // toBlob passes null when the image cannot be created.
                            if (!blob) {
                                reject(new Error("Failed to encode the image as JPEG."));
                                return;
                            }
                            resolve(new File([blob], "compressed_image.jpg", { type: "image/jpeg" }));
                        }, 'image/jpeg', JPEG_QUALITY);
                    };
                    img.src = event.target.result;
                };
                reader.readAsDataURL(file);
            });
        }

        /**
         * Click handler: sends the prompt (and the selected image, if any) to the
         * chosen Gemini model and renders the answer.
         *
         * Only the prompt is required. When an image is selected it is downscaled
         * and JPEG-compressed before upload, and any [a, b, c, d] integer arrays
         * in the response are drawn as bounding boxes over the selected image.
         * The response text is rendered as Markdown into #result.
         *
         * Failures are reported as plain text in #result; nothing is thrown.
         *
         * @returns {Promise<void>}
         */
        async function processImageAndPrompt() {
            const fileInput = document.getElementById('imageInput');
            const promptInput = document.getElementById('promptInput');
            const resultDiv = document.getElementById('result');
            const modelSelect = document.getElementById('modelSelect');

            if (!promptInput.value) {
                alert('Please enter a prompt.');
                return;
            }

            // Snapshot the selection: the user may clear or change the image
            // while the request is in flight, and the boxes must be drawn on
            // the same image that was sent.
            const imageFile = fileInput.files[0];

            resultDiv.textContent = 'Processing...';

            try {
                const model = await getGenerativeModel({ model: modelSelect.value });
                const content = [promptInput.value];

                if (imageFile) {
                    const compressedImage = await resizeAndCompressImage(imageFile);
                    content.push(await fileToGenerativePart(compressedImage));
                }

                const result = await model.generateContent(content);
                const response = await result.response;
                const text = response.text();

                // NOTE(review): the Markdown renderer's output is inserted as HTML
                // without sanitization; model output containing raw HTML would be
                // rendered as-is. Consider adding an HTML sanitizer here.
                resultDiv.innerHTML = marked.parse(text);

                if (imageFile) {
                    // Extract coordinates from the response
                    const coordinates = extractCoordinates(text);
                    if (coordinates.length > 0) {
                        displayImageWithBoundingBoxes(imageFile, coordinates);
                    }
                }
            } catch (error) {
                // textContent, not innerHTML: error messages are not trusted markup.
                resultDiv.textContent = `Error: ${error.message}`;
            }
        }

        /**
         * Finds every bracketed list of exactly four non-negative integers in
         * the text, e.g. "[10, 20, 300, 400]".
         *
         * Digit runs are parsed as base-10 integers, so zero-padded values such
         * as "010" are accepted (JSON parsing would reject them with a
         * SyntaxError even though the pattern matches them).
         *
         * @param {string} text - Text to scan.
         * @returns {number[][]} One four-element array per match, in order of
         *   appearance; an empty array when there are no matches.
         */
        function extractCoordinates(text) {
            const boxPattern = /\[\s*\d+\s*,\s*\d+\s*,\s*\d+\s*,\s*\d+\s*\]/g;
            const boxes = text.match(boxPattern) || [];
            // Each matched box contains exactly four digit runs by construction.
            return boxes.map((box) =>
                box.match(/\d+/g).map((digits) => Number.parseInt(digits, 10)));
        }

        /**
         * Draws the image on #canvas with a 0-1000 reference grid, labelled axes
         * and one coloured rectangle per bounding box.
         *
         * The canvas is sized 100px larger than the image in each dimension and
         * the image is drawn at offset (80, 20), leaving room for the axis labels
         * on the left and bottom. All drawing happens asynchronously, after the
         * file has been read and the image decoded; the function itself returns
         * immediately.
         *
         * @param {Blob} file - Image file to display.
         * @param {number[][]} coordinates - Boxes as [ymin, xmin, ymax, xmax],
         *   each value on a 0-1000 scale relative to the image size.
         * @returns {void}
         */
        function displayImageWithBoundingBoxes(file, coordinates) {
            const reader = new FileReader();
            reader.onload = function(event) {
                const image = new Image();
                image.onload = function() {
                    const canvas = document.getElementById('canvas');
                    // Assigning width/height also resets the canvas, wiping any earlier drawing.
                    canvas.width = image.width + 100;
                    canvas.height = image.height + 100;
                    const ctx = canvas.getContext('2d');
                    
                    // Draw the image
                    ctx.drawImage(image, 80, 20);

                    // Draw grid lines
                    ctx.strokeStyle = 'rgba(255, 0, 0, 0.5)';  // Red with 50% opacity
                    ctx.lineWidth = 1;

                    // Vertical grid lines
                    for (let i = 0; i <= 1000; i += 100) {
                        const x = 80 + i / 1000 * image.width;
                        ctx.beginPath();
                        ctx.moveTo(x, 20);
                        ctx.lineTo(x, image.height + 20);
                        ctx.stroke();
                    }

                    // Horizontal grid lines
                    // ((1000 - i) walks the same eleven positions as i, just bottom-up)
                    for (let i = 0; i <= 1000; i += 100) {
                        const y = 20 + (1000 - i) / 1000 * image.height;
                        ctx.beginPath();
                        ctx.moveTo(80, y);
                        ctx.lineTo(image.width + 80, y);
                        ctx.stroke();
                    }

                    // Draw bounding boxes
                    // Colours cycle when there are more boxes than palette entries.
                    const colors = ['#FF0000', '#00FF00', '#0000FF', '#FFFF00', '#FF00FF', '#00FFFF'];
                    coordinates.forEach((box, index) => {
                        // Convert from the 0-1000 scale to fractions of the image size.
                        const [ymin, xmin, ymax, xmax] = box.map(coord => coord / 1000);
                        
                        const width = (xmax - xmin) * image.width;
                        const height = (ymax - ymin) * image.height;

                        ctx.strokeStyle = colors[index % colors.length];
                        ctx.lineWidth = 5;
                        // +80 / +20 shift the box by the same offset the image was drawn at.
                        ctx.strokeRect(xmin * image.width + 80, ymin * image.height + 20, width, height);
                    });

                    // Draw axes and labels
                    ctx.strokeStyle = '#000000';
                    ctx.lineWidth = 1;
                    ctx.font = '26px Arial';
                    ctx.textAlign = 'right';

                    // Y-axis
                    ctx.beginPath();
                    ctx.moveTo(80, 20);
                    ctx.lineTo(80, image.height + 20);
                    ctx.stroke();

                    // Y-axis labels and ticks
                    // (0 is at the top of the image, 1000 at the bottom)
                    for (let i = 0; i <= 1000; i += 100) {
                        const y = 20 + i / 1000 * image.height;
                        ctx.fillText(i.toString(), 75, y + 5);
                        ctx.beginPath();
                        ctx.moveTo(75, y);
                        ctx.lineTo(80, y);
                        ctx.stroke();
                    }

                    // X-axis
                    ctx.beginPath();
                    ctx.moveTo(80, image.height + 20);
                    ctx.lineTo(image.width + 80, image.height + 20);
                    ctx.stroke();

                    // X-axis labels and ticks
                    ctx.textAlign = 'center';
                    for (let i = 0; i <= 1000; i += 100) {
                        const x = 80 + i / 1000 * image.width;
                        ctx.fillText(i.toString(), x, image.height + 40);
                        ctx.beginPath();
                        ctx.moveTo(x, image.height + 20);
                        ctx.lineTo(x, image.height + 25);
                        ctx.stroke();
                    }
                };
                // Setting src starts decoding; image.onload above runs once it is ready.
                image.src = event.target.result;
            };
            reader.readAsDataURL(file);
        }

        /**
         * Resets the image selection: empties the file input, wipes the canvas
         * and hides the preview image.
         *
         * @returns {void}
         */
        function clearImage() {
            document.getElementById('imageInput').value = '';

            // Hold the element in a local variable; the dimensions must not be
            // read through an undeclared `canvas` identifier.
            const canvas = document.getElementById('canvas');
            canvas.getContext('2d').clearRect(0, 0, canvas.width, canvas.height);

            const preview = document.getElementById('imagePreview');
            preview.src = '';
            preview.style.display = 'none';
        }

        /**
         * Change handler for the file input: shows the chosen image in
         * #imagePreview, or hides the preview when no file is selected.
         *
         * @param {Event} event - Change event whose target is the file input.
         * @returns {void}
         */
        function previewImage(event) {
            const preview = document.getElementById('imagePreview');
            const file = event.target.files[0];

            if (!file) {
                // The change event can arrive with an empty selection
                // (NOTE(review): e.g. some browsers do this when the picker is
                // cancelled - confirm); do not keep showing an image that is
                // no longer selected.
                preview.src = '';
                preview.style.display = 'none';
                return;
            }

            const reader = new FileReader();
            reader.onload = (loadEvent) => {
                preview.src = loadEvent.target.result;
                preview.style.display = 'block';
            };
            reader.readAsDataURL(file);
        }

        // Attach event listeners
        // (this code lives in a <script type="module">, which browsers defer until the
        // document has been parsed, so these elements exist even though the script is in <head>)
        document.getElementById('submitBtn').addEventListener('click', processImageAndPrompt);
        document.getElementById('clearImageBtn').addEventListener('click', clearImage);
        document.getElementById('imageInput').addEventListener('change', previewImage);
    </script>
    <style>
        body {
            font-family: Arial, sans-serif;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
        }
        textarea {
            width: 100%;
            height: 100px;
        }
        #result, #imageContainer {
            margin-top: 20px;
            border: 1px solid #ccc;
            padding: 10px;
        }
        #canvas {
            max-width: 100%;
            height: auto;
        }
        #imagePreview {
            max-width: 100%;
            display: none;
            margin-top: 10px;
        }
    </style>
</head>
<body>
    <h1>Gemini API Image Bounding Box Visualization</h1>
    <select id="modelSelect">
        <option value="gemini-1.5-pro">gemini-1.5-pro (default)</option>
        <option value="gemini-1.5-flash-8b-exp-0827">gemini-1.5-flash-8b-exp-0827</option>
        <option value="gemini-1.5-pro-exp-0827">gemini-1.5-pro-exp-0827</option>
        <option value="gemini-1.5-flash-exp-0827">gemini-1.5-flash-exp-0827</option>
    </select>
    <input type="file" id="imageInput" accept="image/*">
    <button id="clearImageBtn">Clear Image</button>
    <textarea id="promptInput">Return bounding boxes as JSON arrays [ymin, xmin, ymax, xmax]
</textarea>
    <button id="submitBtn">Process</button>
    <div id="result"></div>
    <div id="imageContainer">
        <img id="imagePreview" alt="Image preview" />
        <canvas id="canvas"></canvas>
    </div>
</body>
</html>

I've made the following changes to the HTML:

  1. Added a select element with options for different Gemini models:

    • gemini-1.5-pro (default)
    • gemini-1.5-flash-8b-exp-0827
    • gemini-1.5-pro-exp-0827
    • gemini-1.5-flash-exp-0827
  2. Modified the processImageAndPrompt function to make the image upload optional:

    • It now checks if an image is selected before processing it.
    • If no image is selected, it runs the prompt without the image.
  3. Added a "Clear Image" button:

    • This button clears the selected image, the canvas, and the image preview.
  4. Added an image preview feature:

    • When an image is selected, it's displayed in the area below the input fields.
    • The previewImage function handles this functionality.
  5. Updated the HTML structure to include the new elements:

    • Added the model select dropdown.
    • Added the "Clear Image" button.
    • Added an img element for the image preview.

These changes allow users to select different Gemini models, use the tool with or without an image, clear the selected image easily, and see a preview of the selected image before processing.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment