Skip to content

Instantly share code, notes, and snippets.

@synacktraa
Last active February 5, 2025 07:12
Show Gist options
  • Save synacktraa/50691799698701e6032c0949cbdafeb6 to your computer and use it in GitHub Desktop.
Save synacktraa/50691799698701e6032c0949cbdafeb6 to your computer and use it in GitHub Desktop.
Get selector and DOM chunk from coordinates and bounding boxes
from playwright.sync_api import sync_playwright
from pathlib import Path
import re
from pathlib import Path
from gradio_client import Client, handle_file
from PIL import Image, ImageDraw
# Model identifier sent as `model_id` with every prediction request.
OS_ATLAS_BASE_7B_MODEL_ID = "OS-Copilot/OS-Atlas-Base-7B"
def extract_bbox(response: str):
numbers = re.findall(r'-?\d+(?:\.\d+)?', response)
if not numbers:
return None
x1, y1, x2, y2 = map(float, numbers[:4])
return x1, y1, x2, y2
def extract_coordinates(bbox: tuple[float, float, float, float]):
    """Return the center point of a bounding box as integer coordinates.

    Args:
        bbox: The box as ``(x1, y1, x2, y2)``.

    Returns:
        ``(x, y)`` where each value is the midpoint of the corresponding
        pair of edges, truncated toward zero by ``int``.
    """
    left, top, right, bottom = bbox
    center_x = (left + right) / 2
    center_y = (top + bottom) / 2
    return int(center_x), int(center_y)
class OSAtlasBase7BAPI:
def __init__(self):
import os
self.client = Client("maxiw/OS-ATLAS")
def get_bounding_box(self, query: str, image_file: str | Path):
result = self.client.predict(
image=handle_file(image_file),
text_input=f"{query}\nReturn the response in the form of a bbox",
model_id=OS_ATLAS_BASE_7B_MODEL_ID,
api_name="/run_example",
)
return extract_bbox(result[1])
def get_coordinates(self, query: str, image_file: str | Path):
bbox = self.get_bounding_box(query, image_file)
if bbox is None:
return None
return extract_coordinates(bbox)
# JavaScript source defining `buildSelector(el)`, which walks from `el` up
# through its ancestors and returns a CSS selector path for it. The string is
# meant to be prepended to another script that calls `buildSelector`.
#
# Behaviour of the helper:
#   * a missing element or <body> yields 'body'; <html> yields 'html';
#   * an element with an id short-circuits to `#id`;
#   * otherwise the segment is tag name + classes (+ `:nth-child(n)` when the
#     element has siblings), joined to the parent's selector with ` > `.
# Ids and class names go through `CSS.escape` so that values containing
# characters such as ':' or '/', or starting with a digit, still form a valid
# selector.
BUILD_SELECTOR_FUNCTION = """
const buildSelector = (el) => {
    if (!el || el === document.body) return 'body';
    if (el === document.documentElement) return 'html';
    if (el.id) return `#${CSS.escape(el.id)}`;
    let selector = el.tagName.toLowerCase();
    if (el.classList.length) {
        selector += Array.from(el.classList).map((cls) => `.${CSS.escape(cls)}`).join('');
    }
    const siblings = el.parentNode ? Array.from(el.parentNode.children) : [];
    if (siblings.length > 1) {
        const index = siblings.indexOf(el) + 1;
        selector += `:nth-child(${index})`;
    }
    const parentSelector = buildSelector(el.parentElement);
    return `${parentSelector} > ${selector}`;
};
"""
# JavaScript passed to `page.evaluate`: the `buildSelector` helper followed by
# an arrow function that receives `[x, y]`, looks up the element at that point
# with `document.elementFromPoint`, and returns its selector path (or null when
# no element is found there).
SELECTOR_FROM_COORDS_FUNCTION = BUILD_SELECTOR_FUNCTION + """
([x, y]) => {
const element = document.elementFromPoint(x, y);
return !element ? null : buildSelector(element);
}
"""
# JavaScript passed to `page.evaluate`: the `buildSelector` helper followed by
# an arrow function that receives `[x1, y1, x2, y2, nParents]` (nParents
# defaults to 0). It samples the box on a 5-unit grid, gathers every element
# reported by `document.elementsFromPoint` at those points, keeps the one whose
# bounding rect has the smallest non-zero area, then climbs up to `nParents`
# ancestors. Returns `[outerHTML, selector]` for the resulting element, or
# `[null, null]` when no element with a non-zero area was found.
DOM_CHUNK_FROM_BBOX_FUNCTION = BUILD_SELECTOR_FUNCTION + """
([x1, y1, x2, y2, nParents = 0]) => {
const elements = new Set();
const step = 5; // To cover more points within the box
for (let x = x1; x <= x2; x += step) {
for (let y = y1; y <= y2; y += step) {
const foundElements = document.elementsFromPoint(x, y);
foundElements.forEach(el => elements.add(el));
}
}
// Filter to the smallest element (least area)
let smallestElement = null;
let minArea = Infinity;
elements.forEach(el => {
const rect = el.getBoundingClientRect();
const area = rect.width * rect.height;
if (area > 0 && area < minArea) {
minArea = area;
smallestElement = el;
}
});
if (!smallestElement) return [null, null];
// Move up to nParents levels or until the top parent
let currentElement = smallestElement;
for (let i = 0; i < nParents && currentElement.parentElement; i++) {
currentElement = currentElement.parentElement;
}
return [currentElement.outerHTML, buildSelector(currentElement)];
}
"""
def draw_bounding_box(image_path, bbox, output_path=None, box_color=(255, 0, 0), box_width=3):
    """Draw a rectangle outline on an image and save or display the result.

    Args:
        image_path (str | Path): Path to the input image file.
        bbox (tuple): Box coordinates as (x_min, y_min, x_max, y_max).
        output_path (str | Path, optional): Where to save the annotated image.
            If falsy, the image is displayed with ``Image.show`` instead.
        box_color (tuple, optional): RGB outline color. Default is red.
        box_width (int, optional): Outline width in pixels. Default is 3.
    """
    # Use the image as a context manager so the underlying file is closed
    # once we are done, even if drawing or saving raises.
    with Image.open(image_path) as image:
        draw = ImageDraw.Draw(image)
        draw.rectangle(bbox, outline=box_color, width=box_width)
        if output_path:
            image.save(output_path)
        else:
            image.show()
# grounding_model = OSAtlasBase7BAPI()
def get_selector_and_dom(url: str, query: str):
    """Load *url*, take a screenshot and resolve a bounding box to DOM info.

    Args:
        url: Page to open in headless Chromium (1920x1080 viewport).
        query: Description of the element to locate. Currently unused, see
            the NOTE below.

    Returns:
        ``(selector_path, dom_chunk)``. ``selector_path`` is the selector of
        the element at the center of the box (or ``None``). ``dom_chunk`` is
        the ``[outer_html, selector]`` pair produced by
        ``DOM_CHUNK_FROM_BBOX_FUNCTION`` (or ``None`` when there is no box).

    Side effects:
        Writes ``screenshot.png`` to the current directory, displays the
        annotated screenshot twice, and prints the box and its center.
    """
    # Defaults so the function returns cleanly when no box is available.
    selector_path = None
    dom_chunk = None

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page(viewport={"width": 1920, "height": 1080})
            page.goto(url)

            screenshot_path = Path("screenshot.png")
            page.screenshot(path=str(screenshot_path))

            # NOTE(review): the grounding-model call
            #   bbox = grounding_model.get_bounding_box(query, screenshot_path)
            # is disabled and a fixed box is used instead, so `query` has no
            # effect on the result.
            bbox = (416.0, 413.0, 618.0, 438.0)

            if bbox:
                # Raw box in red (default color).
                draw_bounding_box(screenshot_path, bbox)

                x1, y1, x2, y2 = map(round, bbox)
                x, y = (x1 + x2) // 2, (y1 + y2) // 2

                # Rounded box in green. Padding the box by 10% of the
                # viewport on each side was tried here and left disabled.
                draw_bounding_box(screenshot_path, (x1, y1, x2, y2), box_color=(0, 255, 0), box_width=1)
                print("Bounding box:", (x1, y1, x2, y2))
                print("Coordinates:", (x, y))

                selector_path = page.evaluate(SELECTOR_FROM_COORDS_FUNCTION, [x, y])
                dom_chunk = page.evaluate(DOM_CHUNK_FROM_BBOX_FUNCTION, [x1, y1, x2, y2, 1])
        finally:
            # Always shut the browser down, even if navigation or evaluation fails.
            browser.close()

    return selector_path, dom_chunk
# Demo run: resolve a section of the course page to a selector and a DOM chunk.
url, query = "https://llmagents-learning.org/sp25", "Prospective Students section"
selector, dom_chunk = get_selector_and_dom(url=url, query=query)
print("Selector:", selector)
print("DOM Chunk:", dom_chunk)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment