Skip to content

Instantly share code, notes, and snippets.

@simonw

simonw/bbox.md Secret

Created October 3, 2024 22:24
Show Gist options
  • Save simonw/b96c9f597beacb0e5bc914e495a922f1 to your computer and use it in GitHub Desktop.
Save simonw/b96c9f597beacb0e5bc914e495a922f1 to your computer and use it in GitHub Desktop.

Modify Image Annotation Tool

human (Oct 3, 2024, 03:17 PM)

Alter this tool. When the "processing" button is clicked any annotated images on the page should be removed - but the image showing the file that was selected (if one was selected) should stay. Add a "clear image" button that both clears the file input value and removes the preview of that image.

When an image is displayed with bounding boxes, also perform an operation on each bounding box to extract that exact area as a new image and append that image to the page in a sequence with all of the other images - each image should be preceeded by a title that shows the bounding box coordinates for that image. These should be cleared when a new prompt is executed as well.

assistant (Oct 3, 2024, 03:17 PM)

Certainly. I'll modify the code to implement these changes. Here's the updated HTML file with the requested modifications:

Modified Gemini API Image Bounding Box Visualization Tool

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Gemini API Image Bounding Box Visualization</title>
    <script type="module">
        import { GoogleGenerativeAI } from "https://esm.run/@google/generative-ai";
        import { marked } from "https://esm.run/marked";

        function getApiKey() {
          let apiKey = localStorage.getItem("GEMINI_API_KEY");
          if (!apiKey) {
            apiKey = prompt("Please enter your Gemini API key:");
            if (apiKey) {
              localStorage.setItem("GEMINI_API_KEY", apiKey);
            }
          }
          return apiKey;
        }

        async function getGenerativeModel(params) {
            const API_KEY = getApiKey();
            const genAI = new GoogleGenerativeAI(API_KEY);
            return genAI.getGenerativeModel(params);
        }

        async function fileToGenerativePart(file) {
            return new Promise((resolve) => {
                const reader = new FileReader();
                reader.onloadend = () => resolve({
                    inlineData: { 
                        data: reader.result.split(",")[1],
                        mimeType: file.type 
                    }
                });
                reader.readAsDataURL(file);
            });
        }

        function resizeAndCompressImage(file) {
            return new Promise((resolve) => {
                const reader = new FileReader();
                reader.onload = function(event) {
                    const img = new Image();
                    img.onload = function() {
                        const canvas = document.createElement('canvas');
                        const ctx = canvas.getContext('2d');

                        let width = img.width;
                        let height = img.height;

                        if (width > 1000) {
                            height = Math.round((height * 1000) / width);
                            width = 1000;
                        }

                        canvas.width = width;
                        canvas.height = height;

                        ctx.drawImage(img, 0, 0, width, height);

                        canvas.toBlob((blob) => {
                            resolve(new File([blob], "compressed_image.jpg", { type: "image/jpeg" }));
                        }, 'image/jpeg', 0.7);
                    };
                    img.src = event.target.result;
                };
                reader.readAsDataURL(file);
            });
        }

        async function processImageAndPrompt() {
            const fileInput = document.getElementById('imageInput');
            const promptInput = document.getElementById('promptInput');
            const resultDiv = document.getElementById('result');
            const modelSelect = document.getElementById('modelSelect');
            
            if (!promptInput.value) {
                alert('Please enter a prompt.');
                return;
            }

            resultDiv.innerHTML = 'Processing...';
            
            // Clear previous bounding box images
            document.getElementById('boundingBoxImages').innerHTML = '';
            
            try {
                const model = await getGenerativeModel({ model: modelSelect.value });
                let content = [promptInput.value];

                if (fileInput.files[0]) {
                    const compressedImage = await resizeAndCompressImage(fileInput.files[0]);
                    const imagePart = await fileToGenerativePart(compressedImage);
                    content.push(imagePart);
                }
                
                const result = await model.generateContent(content);
                const response = await result.response;
                const text = response.text();
                
                resultDiv.innerHTML = marked.parse(text);

                if (fileInput.files[0]) {
                    // Extract coordinates from the response
                    const coordinates = extractCoordinates(text);
                    if (coordinates.length > 0) {
                        displayImageWithBoundingBoxes(fileInput.files[0], coordinates);
                    }
                }
            } catch (error) {
                resultDiv.innerHTML = `Error: ${error.message}`;
            }
        }

        function extractCoordinates(text) {
            const regex = /\[\s*\d+\s*,\s*\d+\s*,\s*\d+\s*,\s*\d+\s*\]/g;
            const matches = text.match(regex) || [];
            return matches.map(JSON.parse);
        }

        function displayImageWithBoundingBoxes(file, coordinates) {
            const reader = new FileReader();
            reader.onload = function(event) {
                const image = new Image();
                image.onload = function() {
                    const canvas = document.getElementById('canvas');
                    canvas.width = image.width + 100;
                    canvas.height = image.height + 100;
                    const ctx = canvas.getContext('2d');
                    
                    // Draw the image
                    ctx.drawImage(image, 80, 20);

                    // Draw grid lines
                    ctx.strokeStyle = 'rgba(255, 0, 0, 0.5)';  // Red with 50% opacity
                    ctx.lineWidth = 1;

                    // Vertical grid lines
                    for (let i = 0; i <= 1000; i += 100) {
                        const x = 80 + i / 1000 * image.width;
                        ctx.beginPath();
                        ctx.moveTo(x, 20);
                        ctx.lineTo(x, image.height + 20);
                        ctx.stroke();
                    }

                    // Horizontal grid lines
                    for (let i = 0; i <= 1000; i += 100) {
                        const y = 20 + (1000 - i) / 1000 * image.height;
                        ctx.beginPath();
                        ctx.moveTo(80, y);
                        ctx.lineTo(image.width + 80, y);
                        ctx.stroke();
                    }

                    // Draw bounding boxes
                    const colors = ['#FF0000', '#00FF00', '#0000FF', '#FFFF00', '#FF00FF', '#00FFFF'];
                    coordinates.forEach((box, index) => {
                        const [ymin, xmin, ymax, xmax] = box.map(coord => coord / 1000);
                        
                        const width = (xmax - xmin) * image.width;
                        const height = (ymax - ymin) * image.height;

                        ctx.strokeStyle = colors[index % colors.length];
                        ctx.lineWidth = 5;
                        ctx.strokeRect(xmin * image.width + 80, ymin * image.height + 20, width, height);

                        // Extract and display bounding box image
                        const boundingBoxCanvas = document.createElement('canvas');
                        boundingBoxCanvas.width = width;
                        boundingBoxCanvas.height = height;
                        const bbCtx = boundingBoxCanvas.getContext('2d');
                        bbCtx.drawImage(image, xmin * image.width, ymin * image.height, width, height, 0, 0, width, height);

                        const boundingBoxImages = document.getElementById('boundingBoxImages');
                        const title = document.createElement('p');
                        title.textContent = `Bounding Box: [${box.join(', ')}]`;
                        boundingBoxImages.appendChild(title);
                        boundingBoxImages.appendChild(boundingBoxCanvas);
                    });

                    // Draw axes and labels
                    ctx.strokeStyle = '#000000';
                    ctx.lineWidth = 1;
                    ctx.font = '26px Arial';
                    ctx.textAlign = 'right';

                    // Y-axis
                    ctx.beginPath();
                    ctx.moveTo(80, 20);
                    ctx.lineTo(80, image.height + 20);
                    ctx.stroke();

                    // Y-axis labels and ticks
                    for (let i = 0; i <= 1000; i += 100) {
                        const y = 20 + i / 1000 * image.height;
                        ctx.fillText(i.toString(), 75, y + 5);
                        ctx.beginPath();
                        ctx.moveTo(75, y);
                        ctx.lineTo(80, y);
                        ctx.stroke();
                    }

                    // X-axis
                    ctx.beginPath();
                    ctx.moveTo(80, image.height + 20);
                    ctx.lineTo(image.width + 80, image.height + 20);
                    ctx.stroke();

                    // X-axis labels and ticks
                    ctx.textAlign = 'center';
                    for (let i = 0; i <= 1000; i += 100) {
                        const x = 80 + i / 1000 * image.width;
                        ctx.fillText(i.toString(), x, image.height + 40);
                        ctx.beginPath();
                        ctx.moveTo(x, image.height + 20);
                        ctx.lineTo(x, image.height + 25);
                        ctx.stroke();
                    }
                };
                image.src = event.target.result;
            };
            reader.readAsDataURL(file);
        }

        function clearImage() {
            document.getElementById('imageInput').value = '';
            document.getElementById('canvas').getContext('2d').clearRect(0, 0, canvas.width, canvas.height);
            document.getElementById('imagePreview').src = '';
            document.getElementById('imagePreview').style.display = 'none';
            document.getElementById('boundingBoxImages').innerHTML = '';
        }

        function previewImage(event) {
            const file = event.target.files[0];
            if (file) {
                const reader = new FileReader();
                reader.onload = function(e) {
                    const img = document.getElementById('imagePreview');
                    img.src = e.target.result;
                    img.style.display = 'block';
                }
                reader.readAsDataURL(file);
            }
        }

        // Attach event listeners
        document.getElementById('submitBtn').addEventListener('click', processImageAndPrompt);
        document.getElementById('clearImageBtn').addEventListener('click', clearImage);
        document.getElementById('imageInput').addEventListener('change', previewImage);
    </script>
    <style>
        body {
            font-family: Arial, sans-serif;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
        }
        textarea {
            width: 100%;
            height: 100px;
        }
        #result, #imageContainer {
            margin-top: 20px;
            border: 1px solid #ccc;
            padding: 10px;
        }
        #canvas {
            max-width: 100%;
            height: auto;
        }
        #imagePreview {
            max-width: 100%;
            display: none;
            margin-top: 10px;
        }
        #boundingBoxImages {
            display: flex;
            flex-wrap: wrap;
            gap: 10px;
            margin-top: 20px;
        }
        #boundingBoxImages canvas {
            max-width: 200px;
            height: auto;
        }
    </style>
</head>
<body>
    <h1>Gemini API Image Bounding Box Visualization</h1>
    <select id="modelSelect">
        <option value="gemini-1.5-pro-latest">gemini-1.5-pro</option>
        <option value="gemini-1.5-flash-latest">gemini-1.5-flash</option>
        <option value="gemini-1.5-flash-8b-latest">gemini-1.5-flash-8b</option>
    </select>
    <input type="file" id="imageInput" accept="image/*">
    <button id="clearImageBtn">Clear Image</button>
    <textarea id="promptInput">Return bounding boxes as JSON arrays [ymin, xmin, ymax, xmax]
</textarea>
    <button id="submitBtn">Process</button>
    <div id="result"></div>
    <div id="imageContainer">
        <img id="imagePreview" alt="Image preview" />
        <canvas id="canvas"></canvas>
    </div>
    <div id="boundingBoxImages"></div>
</body>
</html>

Here are the main changes I've made to the code:

  1. Modified the processImageAndPrompt function to clear previous bounding box images when processing a new prompt:

    document.getElementById('boundingBoxImages').innerHTML = '';
  2. Updated the displayImageWithBoundingBoxes function to extract and display individual bounding box images:

    // Extract and display bounding box image
    const boundingBoxCanvas = document.createElement('canvas');
    boundingBoxCanvas.width = width;
    boundingBoxCanvas.height = height;
    const bbCtx = boundingBoxCanvas.getContext('2d');
    bbCtx.drawImage(image, xmin * image.width, ymin * image.height, width, height, 0, 0, width, height);
    
    const boundingBoxImages = document.getElementById('boundingBoxImages');
    const title = document.createElement('p');
    title.textContent = `Bounding Box: [${box.join(', ')}]`;
    boundingBoxImages.appendChild(title);
    boundingBoxImages.appendChild(boundingBoxCanvas);
  3. Modified the clearImage function to also clear the bounding box images:

    document.getElementById('boundingBoxImages').innerHTML = '';
  4. Added CSS styles for the bounding box images:

    #boundingBoxImages {
        display: flex;
        flex-wrap: wrap;
        gap: 10px;
        margin-top: 20px;
    }
    #boundingBoxImages canvas {
        max-width: 200px;
        height: auto;
    }
  5. Added a new div to the HTML body to contain the bounding box images:

    <div id="boundingBoxImages"></div>

These changes implement the requested functionality:

  • The "Clear Image" button now removes both the file input value and the image preview.
  • When processing a new prompt, any previously displayed bounding box images are removed.
  • Each bounding box is now extracted as a separate image and displayed below the main image, preceded by a title showing its coordinates.
  • The bounding box images are cleared when a new prompt is executed or when the "Clear Image" button is clicked.

The tool now provides a more comprehensive visualization of the bounding boxes, allowing users to see both the overall image with bounding boxes and the individual extracted regions.

human (Oct 3, 2024, 03:20 PM)

Make sure the bounding box images are each display block and are scaled to the correct width and height (after resolving the 0-1000 scale differences)

assistant (Oct 3, 2024, 03:20 PM)

Certainly. I'll modify the code to ensure the bounding box images are displayed as block elements and are scaled correctly based on the 0-1000 scale differences. Here's the updated version of the HTML file:

Modified Gemini API Image Bounding Box Visualization Tool

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Gemini API Image Bounding Box Visualization</title>
    <script type="module">
        import { GoogleGenerativeAI } from "https://esm.run/@google/generative-ai";
        import { marked } from "https://esm.run/marked";

        // ... (previous functions remain unchanged)

        function displayImageWithBoundingBoxes(file, coordinates) {
            const reader = new FileReader();
            reader.onload = function(event) {
                const image = new Image();
                image.onload = function() {
                    const canvas = document.getElementById('canvas');
                    canvas.width = image.width + 100;
                    canvas.height = image.height + 100;
                    const ctx = canvas.getContext('2d');
                    
                    // Draw the image
                    ctx.drawImage(image, 80, 20);

                    // Draw grid lines and axes (unchanged)
                    // ...

                    // Draw bounding boxes
                    const colors = ['#FF0000', '#00FF00', '#0000FF', '#FFFF00', '#FF00FF', '#00FFFF'];
                    const boundingBoxImages = document.getElementById('boundingBoxImages');
                    boundingBoxImages.innerHTML = ''; // Clear previous bounding box images

                    coordinates.forEach((box, index) => {
                        const [ymin, xmin, ymax, xmax] = box.map(coord => coord / 1000);
                        
                        const width = (xmax - xmin) * image.width;
                        const height = (ymax - ymin) * image.height;

                        ctx.strokeStyle = colors[index % colors.length];
                        ctx.lineWidth = 5;
                        ctx.strokeRect(xmin * image.width + 80, ymin * image.height + 20, width, height);

                        // Extract and display bounding box image
                        const boundingBoxCanvas = document.createElement('canvas');
                        boundingBoxCanvas.width = width;
                        boundingBoxCanvas.height = height;
                        const bbCtx = boundingBoxCanvas.getContext('2d');
                        bbCtx.drawImage(image, xmin * image.width, ymin * image.height, width, height, 0, 0, width, height);

                        const boundingBoxContainer = document.createElement('div');
                        boundingBoxContainer.className = 'bounding-box-container';

                        const title = document.createElement('p');
                        title.textContent = `Bounding Box: [${box.join(', ')}]`;
                        boundingBoxContainer.appendChild(title);

                        const scaledWidth = Math.round(width);
                        const scaledHeight = Math.round(height);
                        boundingBoxCanvas.style.width = `${scaledWidth}px`;
                        boundingBoxCanvas.style.height = `${scaledHeight}px`;
                        boundingBoxContainer.appendChild(boundingBoxCanvas);

                        boundingBoxImages.appendChild(boundingBoxContainer);
                    });

                    // Draw axes and labels (unchanged)
                    // ...
                };
                image.src = event.target.result;
            };
            reader.readAsDataURL(file);
        }

        // ... (remaining functions and event listeners remain unchanged)

    </script>
    <style>
        body {
            font-family: Arial, sans-serif;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
        }
        textarea {
            width: 100%;
            height: 100px;
        }
        #result, #imageContainer {
            margin-top: 20px;
            border: 1px solid #ccc;
            padding: 10px;
        }
        #canvas {
            max-width: 100%;
            height: auto;
        }
        #imagePreview {
            max-width: 100%;
            display: none;
            margin-top: 10px;
        }
        #boundingBoxImages {
            display: flex;
            flex-wrap: wrap;
            gap: 20px;
            margin-top: 20px;
        }
        .bounding-box-container {
            display: block;
            margin-bottom: 20px;
        }
        .bounding-box-container p {
            margin: 0 0 5px 0;
            font-weight: bold;
        }
        .bounding-box-container canvas {
            display: block;
            border: 1px solid #ccc;
        }
    </style>
</head>
<body>
    <h1>Gemini API Image Bounding Box Visualization</h1>
    <select id="modelSelect">
        <option value="gemini-1.5-pro-latest">gemini-1.5-pro</option>
        <option value="gemini-1.5-flash-latest">gemini-1.5-flash</option>
        <option value="gemini-1.5-flash-8b-latest">gemini-1.5-flash-8b</option>
    </select>
    <input type="file" id="imageInput" accept="image/*">
    <button id="clearImageBtn">Clear Image</button>
    <textarea id="promptInput">Return bounding boxes as JSON arrays [ymin, xmin, ymax, xmax]
</textarea>
    <button id="submitBtn">Process</button>
    <div id="result"></div>
    <div id="imageContainer">
        <img id="imagePreview" alt="Image preview" />
        <canvas id="canvas"></canvas>
    </div>
    <div id="boundingBoxImages"></div>
</body>
</html>

Here are the main changes I've made to address your requirements:

  1. Modified the displayImageWithBoundingBoxes function to scale the bounding box images correctly:
const scaledWidth = Math.round(width);
const scaledHeight = Math.round(height);
boundingBoxCanvas.style.width = `${scaledWidth}px`;
boundingBoxCanvas.style.height = `${scaledHeight}px`;

This ensures that the bounding box images are displayed at their correct size relative to the original image.

  1. Created a container for each bounding box image to group the title and the image:
const boundingBoxContainer = document.createElement('div');
boundingBoxContainer.className = 'bounding-box-container';

const title = document.createElement('p');
title.textContent = `Bounding Box: [${box.join(', ')}]`;
boundingBoxContainer.appendChild(title);

boundingBoxContainer.appendChild(boundingBoxCanvas);

boundingBoxImages.appendChild(boundingBoxContainer);
  1. Updated the CSS to ensure proper display of the bounding box images:
#boundingBoxImages {
    display: flex;
    flex-wrap: wrap;
    gap: 20px;
    margin-top: 20px;
}
.bounding-box-container {
    display: block;
    margin-bottom: 20px;
}
.bounding-box-container p {
    margin: 0 0 5px 0;
    font-weight: bold;
}
.bounding-box-container canvas {
    display: block;
    border: 1px solid #ccc;
}

These CSS changes ensure that:

  • Each bounding box container is displayed as a block element.
  • The title (coordinates) is displayed above the image.
  • The canvas (bounding box image) is displayed as a block element.
  • There's proper spacing between bounding box images.

With these modifications, the bounding box images will now be displayed as block elements and scaled correctly based on their actual dimensions from the original image. The coordinates are shown above each bounding box image, and the layout should be more organized and easier to read.

human (Oct 3, 2024, 03:21 PM)

the bounding box images should not be shown as canvas elements - they should be JPEG images using a base64 data URI

assistant (Oct 3, 2024, 03:21 PM)

I apologize for the misunderstanding. You're right, we should convert the canvas elements to JPEG images using base64 data URIs. I'll modify the code to implement this change. Here's the updated version:

Modified Gemini API Image Bounding Box Visualization Tool

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Gemini API Image Bounding Box Visualization</title>
    <script type="module">
        import { GoogleGenerativeAI } from "https://esm.run/@google/generative-ai";
        import { marked } from "https://esm.run/marked";

        // ... (previous functions remain unchanged)

        function displayImageWithBoundingBoxes(file, coordinates) {
            const reader = new FileReader();
            reader.onload = function(event) {
                const image = new Image();
                image.onload = function() {
                    const canvas = document.getElementById('canvas');
                    canvas.width = image.width + 100;
                    canvas.height = image.height + 100;
                    const ctx = canvas.getContext('2d');
                    
                    // Draw the image
                    ctx.drawImage(image, 80, 20);

                    // Draw grid lines and axes (unchanged)
                    // ...

                    // Draw bounding boxes
                    const colors = ['#FF0000', '#00FF00', '#0000FF', '#FFFF00', '#FF00FF', '#00FFFF'];
                    const boundingBoxImages = document.getElementById('boundingBoxImages');
                    boundingBoxImages.innerHTML = ''; // Clear previous bounding box images

                    coordinates.forEach((box, index) => {
                        const [ymin, xmin, ymax, xmax] = box.map(coord => coord / 1000);
                        
                        const width = (xmax - xmin) * image.width;
                        const height = (ymax - ymin) * image.height;

                        ctx.strokeStyle = colors[index % colors.length];
                        ctx.lineWidth = 5;
                        ctx.strokeRect(xmin * image.width + 80, ymin * image.height + 20, width, height);

                        // Extract bounding box image
                        const boundingBoxCanvas = document.createElement('canvas');
                        boundingBoxCanvas.width = width;
                        boundingBoxCanvas.height = height;
                        const bbCtx = boundingBoxCanvas.getContext('2d');
                        bbCtx.drawImage(image, xmin * image.width, ymin * image.height, width, height, 0, 0, width, height);

                        // Convert canvas to JPEG base64 data URI
                        const dataURI = boundingBoxCanvas.toDataURL('image/jpeg');

                        const boundingBoxContainer = document.createElement('div');
                        boundingBoxContainer.className = 'bounding-box-container';

                        const title = document.createElement('p');
                        title.textContent = `Bounding Box: [${box.join(', ')}]`;
                        boundingBoxContainer.appendChild(title);

                        const img = document.createElement('img');
                        img.src = dataURI;
                        img.style.width = `${Math.round(width)}px`;
                        img.style.height = `${Math.round(height)}px`;
                        boundingBoxContainer.appendChild(img);

                        boundingBoxImages.appendChild(boundingBoxContainer);
                    });

                    // Draw axes and labels (unchanged)
                    // ...
                };
                image.src = event.target.result;
            };
            reader.readAsDataURL(file);
        }

        // ... (remaining functions and event listeners remain unchanged)

    </script>
    <style>
        body {
            font-family: Arial, sans-serif;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
        }
        textarea {
            width: 100%;
            height: 100px;
        }
        #result, #imageContainer {
            margin-top: 20px;
            border: 1px solid #ccc;
            padding: 10px;
        }
        #canvas {
            max-width: 100%;
            height: auto;
        }
        #imagePreview {
            max-width: 100%;
            display: none;
            margin-top: 10px;
        }
        #boundingBoxImages {
            display: flex;
            flex-wrap: wrap;
            gap: 20px;
            margin-top: 20px;
        }
        .bounding-box-container {
            display: block;
            margin-bottom: 20px;
        }
        .bounding-box-container p {
            margin: 0 0 5px 0;
            font-weight: bold;
        }
        .bounding-box-container img {
            display: block;
            border: 1px solid #ccc;
        }
    </style>
</head>
<body>
    <h1>Gemini API Image Bounding Box Visualization</h1>
    <select id="modelSelect">
        <option value="gemini-1.5-pro-latest">gemini-1.5-pro</option>
        <option value="gemini-1.5-flash-latest">gemini-1.5-flash</option>
        <option value="gemini-1.5-flash-8b-latest">gemini-1.5-flash-8b</option>
    </select>
    <input type="file" id="imageInput" accept="image/*">
    <button id="clearImageBtn">Clear Image</button>
    <textarea id="promptInput">Return bounding boxes as JSON arrays [ymin, xmin, ymax, xmax]
</textarea>
    <button id="submitBtn">Process</button>
    <div id="result"></div>
    <div id="imageContainer">
        <img id="imagePreview" alt="Image preview" />
        <canvas id="canvas"></canvas>
    </div>
    <div id="boundingBoxImages"></div>
</body>
</html>

Here are the main changes I've made to address your requirements:

  1. Modified the displayImageWithBoundingBoxes function to convert the canvas to a JPEG base64 data URI:
// Convert canvas to JPEG base64 data URI
const dataURI = boundingBoxCanvas.toDataURL('image/jpeg');

const boundingBoxContainer = document.createElement('div');
boundingBoxContainer.className = 'bounding-box-container';

const title = document.createElement('p');
title.textContent = `Bounding Box: [${box.join(', ')}]`;
boundingBoxContainer.appendChild(title);

const img = document.createElement('img');
img.src = dataURI;
img.style.width = `${Math.round(width)}px`;
img.style.height = `${Math.round(height)}px`;
boundingBoxContainer.appendChild(img);

boundingBoxImages.appendChild(boundingBoxContainer);

This change ensures that:

  • The bounding box image is converted to a JPEG format using a base64 data URI.
  • An <img> element is created with the data URI as its source, replacing the previous canvas element.
  • The image is sized correctly based on the bounding box dimensions.
  1. Updated the CSS to style the new <img> elements:
.bounding-box-container img {
    display: block;
    border: 1px solid #ccc;
}

These modifications ensure that:

  • The bounding box images are now displayed as JPEG images using base64 data URIs.
  • Each image is displayed as a block element with the correct dimensions.
  • The layout remains organized and easy to read.

With these changes, the tool now displays the bounding box images as JPEG images using base64 data URIs, while maintaining the correct scaling and layout. The main image with bounding boxes drawn on it is still displayed using a canvas element, but the individual bounding box images are now proper JPEG images.

human (Oct 3, 2024, 03:22 PM)

I need you to assemble the final HTML and all JavaScript for me

assistant (Oct 3, 2024, 03:22 PM)

Certainly! I'll assemble the complete HTML file with all the JavaScript included. This will be the final, fully functional version of the Gemini API Image Bounding Box Visualization tool.

Final Gemini API Image Bounding Box Visualization Tool

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Gemini API Image Bounding Box Visualization</title>
    <script type="module">
        import { GoogleGenerativeAI } from "https://esm.run/@google/generative-ai";
        import { marked } from "https://esm.run/marked";

        function getApiKey() {
            let apiKey = localStorage.getItem("GEMINI_API_KEY");
            if (!apiKey) {
                apiKey = prompt("Please enter your Gemini API key:");
                if (apiKey) {
                    localStorage.setItem("GEMINI_API_KEY", apiKey);
                }
            }
            return apiKey;
        }

        async function getGenerativeModel(params) {
            const API_KEY = getApiKey();
            const genAI = new GoogleGenerativeAI(API_KEY);
            return genAI.getGenerativeModel(params);
        }

        async function fileToGenerativePart(file) {
            return new Promise((resolve) => {
                const reader = new FileReader();
                reader.onloadend = () => resolve({
                    inlineData: { 
                        data: reader.result.split(",")[1],
                        mimeType: file.type 
                    }
                });
                reader.readAsDataURL(file);
            });
        }

        function resizeAndCompressImage(file) {
            return new Promise((resolve) => {
                const reader = new FileReader();
                reader.onload = function(event) {
                    const img = new Image();
                    img.onload = function() {
                        const canvas = document.createElement('canvas');
                        const ctx = canvas.getContext('2d');

                        let width = img.width;
                        let height = img.height;

                        if (width > 1000) {
                            height = Math.round((height * 1000) / width);
                            width = 1000;
                        }

                        canvas.width = width;
                        canvas.height = height;

                        ctx.drawImage(img, 0, 0, width, height);

                        canvas.toBlob((blob) => {
                            resolve(new File([blob], "compressed_image.jpg", { type: "image/jpeg" }));
                        }, 'image/jpeg', 0.7);
                    };
                    img.src = event.target.result;
                };
                reader.readAsDataURL(file);
            });
        }

        async function processImageAndPrompt() {
            const fileInput = document.getElementById('imageInput');
            const promptInput = document.getElementById('promptInput');
            const resultDiv = document.getElementById('result');
            const modelSelect = document.getElementById('modelSelect');
            
            if (!promptInput.value) {
                alert('Please enter a prompt.');
                return;
            }

            resultDiv.innerHTML = 'Processing...';
            
            // Clear previous bounding box images
            document.getElementById('boundingBoxImages').innerHTML = '';
            
            try {
                const model = await getGenerativeModel({ model: modelSelect.value });
                let content = [promptInput.value];

                if (fileInput.files[0]) {
                    const compressedImage = await resizeAndCompressImage(fileInput.files[0]);
                    const imagePart = await fileToGenerativePart(compressedImage);
                    content.push(imagePart);
                }
                
                const result = await model.generateContent(content);
                const response = await result.response;
                const text = response.text();
                
                resultDiv.innerHTML = marked.parse(text);

                if (fileInput.files[0]) {
                    // Extract coordinates from the response
                    const coordinates = extractCoordinates(text);
                    if (coordinates.length > 0) {
                        displayImageWithBoundingBoxes(fileInput.files[0], coordinates);
                    }
                }
            } catch (error) {
                resultDiv.innerHTML = `Error: ${error.message}`;
            }
        }

        function extractCoordinates(text) {
            const regex = /\[\s*\d+\s*,\s*\d+\s*,\s*\d+\s*,\s*\d+\s*\]/g;
            const matches = text.match(regex) || [];
            return matches.map(JSON.parse);
        }

        function displayImageWithBoundingBoxes(file, coordinates) {
            const reader = new FileReader();
            reader.onload = function(event) {
                const image = new Image();
                image.onload = function() {
                    const canvas = document.getElementById('canvas');
                    canvas.width = image.width + 100;
                    canvas.height = image.height + 100;
                    const ctx = canvas.getContext('2d');
                    
                    // Draw the image
                    ctx.drawImage(image, 80, 20);

                    // Draw grid lines
                    ctx.strokeStyle = 'rgba(255, 0, 0, 0.5)';  // Red with 50% opacity
                    ctx.lineWidth = 1;

                    // Vertical grid lines
                    for (let i = 0; i <= 1000; i += 100) {
                        const x = 80 + i / 1000 * image.width;
                        ctx.beginPath();
                        ctx.moveTo(x, 20);
                        ctx.lineTo(x, image.height + 20);
                        ctx.stroke();
                    }

                    // Horizontal grid lines
                    for (let i = 0; i <= 1000; i += 100) {
                        const y = 20 + (1000 - i) / 1000 * image.height;
                        ctx.beginPath();
                        ctx.moveTo(80, y);
                        ctx.lineTo(image.width + 80, y);
                        ctx.stroke();
                    }

                    // Draw bounding boxes
                    const colors = ['#FF0000', '#00FF00', '#0000FF', '#FFFF00', '#FF00FF', '#00FFFF'];
                    const boundingBoxImages = document.getElementById('boundingBoxImages');
                    boundingBoxImages.innerHTML = ''; // Clear previous bounding box images

                    coordinates.forEach((box, index) => {
                        const [ymin, xmin, ymax, xmax] = box.map(coord => coord / 1000);
                        
                        const width = (xmax - xmin) * image.width;
                        const height = (ymax - ymin) * image.height;

                        ctx.strokeStyle = colors[index % colors.length];
                        ctx.lineWidth = 5;
                        ctx.strokeRect(xmin * image.width + 80, ymin * image.height + 20, width, height);

                        // Extract bounding box image
                        const boundingBoxCanvas = document.createElement('canvas');
                        boundingBoxCanvas.width = width;
                        boundingBoxCanvas.height = height;
                        const bbCtx = boundingBoxCanvas.getContext('2d');
                        bbCtx.drawImage(image, xmin * image.width, ymin * image.height, width, height, 0, 0, width, height);

                        // Convert canvas to JPEG base64 data URI
                        const dataURI = boundingBoxCanvas.toDataURL('image/jpeg');

                        const boundingBoxContainer = document.createElement('div');
                        boundingBoxContainer.className = 'bounding-box-container';

                        const title = document.createElement('p');
                        title.textContent = `Bounding Box: [${box.join(', ')}]`;
                        boundingBoxContainer.appendChild(title);

                        const img = document.createElement('img');
                        img.src = dataURI;
                        img.style.width = `${Math.round(width)}px`;
                        img.style.height = `${Math.round(height)}px`;
                        boundingBoxContainer.appendChild(img);

                        boundingBoxImages.appendChild(boundingBoxContainer);
                    });

                    // Draw axes and labels
                    ctx.strokeStyle = '#000000';
                    ctx.lineWidth = 1;
                    ctx.font = '26px Arial';
                    ctx.textAlign = 'right';

                    // Y-axis
                    ctx.beginPath();
                    ctx.moveTo(80, 20);
                    ctx.lineTo(80, image.height + 20);
                    ctx.stroke();

                    // Y-axis labels and ticks
                    for (let i = 0; i <= 1000; i += 100) {
                        const y = 20 + i / 1000 * image.height;
                        ctx.fillText(i.toString(), 75, y + 5);
                        ctx.beginPath();
                        ctx.moveTo(75, y);
                        ctx.lineTo(80, y);
                        ctx.stroke();
                    }

                    // X-axis
                    ctx.beginPath();
                    ctx.moveTo(80, image.height + 20);
                    ctx.lineTo(image.width + 80, image.height + 20);
                    ctx.stroke();

                    // X-axis labels and ticks
                    ctx.textAlign = 'center';
                    for (let i = 0; i <= 1000; i += 100) {
                        const x = 80 + i / 1000 * image.width;
                        ctx.fillText(i.toString(), x, image.height + 40);
                        ctx.beginPath();
                        ctx.moveTo(x, image.height + 20);
                        ctx.lineTo(x, image.height + 25);
                        ctx.stroke();
                    }
                };
                image.src = event.target.result;
            };
            reader.readAsDataURL(file);
        }

        function clearImage() {
            document.getElementById('imageInput').value = '';
            document.getElementById('canvas').getContext('2d').clearRect(0, 0, canvas.width, canvas.height);
            document.getElementById('imagePreview').src = '';
            document.getElementById('imagePreview').style.display = 'none';
            document.getElementById('boundingBoxImages').innerHTML = '';
        }

        function previewImage(event) {
            const file = event.target.files[0];
            if (file) {
                const reader = new FileReader();
                reader.onload = function(e) {
                    const img = document.getElementById('imagePreview');
                    img.src = e.target.result;
                    img.style.display = 'block';
                }
                reader.readAsDataURL(file);
            }
        }

        // Attach event listeners
        document.getElementById('submitBtn').addEventListener('click', processImageAndPrompt);
        document.getElementById('clearImageBtn').addEventListener('click', clearImage);
        document.getElementById('imageInput').addEventListener('change', previewImage);
    </script>
    <style>
        body {
            font-family: Arial, sans-serif;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
        }
        textarea {
            width: 100%;
            height: 100px;
        }
        #result, #imageContainer {
            margin-top: 20px;
            border: 1px solid #ccc;
            padding: 10px;
        }
        #canvas {
            max-width: 100%;
            height: auto;
        }
        #imagePreview {
            max-width: 100%;
            display: none;
            margin-top: 10px;
        }
        #boundingBoxImages {
            display: flex;
            flex-wrap: wrap;
            gap: 20px;
            margin-top: 20px;
        }
        .bounding-box-container {
            display: block;
            margin-bottom: 20px;
        }
        .bounding-box-container p {
            margin: 0 0 5px 0;
            font-weight: bold;
        }
        .bounding-box-container img {
            display: block;
            border: 1px solid #ccc;
        }
    </style>
</head>
<body>
    <h1>Gemini API Image Bounding Box Visualization</h1>
    <select id="modelSelect">
        <option value="gemini-1.5-pro-latest">gemini-1.5-pro</option>
        <option value="gemini-1.5-flash-latest">gemini-1.5-flash</option>
        <option value="gemini-1.5-flash-8b-latest">gemini-1.5-flash-8b</option>
    </select>
    <input type="file" id="imageInput" accept="image/*">
    <button id="clearImageBtn">Clear Image</button>
    <textarea id="promptInput">Return bounding boxes as JSON arrays [ymin, xmin, ymax, xmax]
</textarea>
    <button id="submitBtn">Process</button>
    <div id="result"></div>
    <div id="imageContainer">
        <img id="imagePreview" alt="Image preview" />
        <canvas id="canvas"></canvas>
    </div>
    <div id="boundingBoxImages"></div>
</body>
</html>

This HTML file contains all the necessary JavaScript code and styling to run the Gemini API Image Bounding Box Visualization tool. Here's a summary of its main features:

  1. It uses the Gemini API to process images and generate bounding boxes.
  2. Users can select different Gemini models (gemini-1.5-pro, gemini-1.5-flash, gemini-1.5-flash-8b).
  3. It allows users to upload an image and enter a prompt.
  4. The tool processes the image and prompt, displaying the results and any bounding boxes.
  5. Bounding boxes are drawn on the main image using a canvas element.
  6. Individual bounding box images are extracted and displayed as JPEG images using base64 data URIs.
  7. The tool includes a "Clear Image" button to reset the interface.
  8. It provides an image preview when a file is selected.

To use this tool, you would need to:

  1. Save this HTML content to a file (e.g., gemini_bounding_box_tool.html).
  2. Open the file in a web browser.
  3. Enter your Gemini API key when prompted (it will be stored in localStorage for future use).
  4. Upload an image, enter a prompt, and click "Process" to see the results.

Remember that you'll need a valid Gemini API key to use this tool, and you should comply with Google's terms of service and usage policies when using the Gemini API.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment