Last active
February 5, 2025 07:12
-
-
Save synacktraa/50691799698701e6032c0949cbdafeb6 to your computer and use it in GitHub Desktop.
Get the CSS selector and DOM chunk from coordinates and bounding boxes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from playwright.sync_api import sync_playwright | |
from pathlib import Path | |
import re | |
from pathlib import Path | |
from gradio_client import Client, handle_file | |
from PIL import Image, ImageDraw | |
# Model identifier sent as `model_id` with every prediction request.
OS_ATLAS_BASE_7B_MODEL_ID = "OS-Copilot/OS-Atlas-Base-7B"
def extract_bbox(response: str): | |
numbers = re.findall(r'-?\d+(?:\.\d+)?', response) | |
if not numbers: | |
return None | |
x1, y1, x2, y2 = map(float, numbers[:4]) | |
return x1, y1, x2, y2 | |
def extract_coordinates(bbox: tuple[float, float, float, float]):
    """Return the centre point of a bounding box as integer coordinates.

    Args:
        bbox: Box corners in the order ``(x1, y1, x2, y2)``.

    Returns:
        ``(x, y)`` where each value is the midpoint of the corresponding
        pair of edges, truncated toward zero by ``int``.
    """
    left, top, right, bottom = bbox
    center_x = int((left + right) / 2)
    center_y = int((top + bottom) / 2)
    return center_x, center_y
class OSAtlasBase7BAPI: | |
def __init__(self): | |
import os | |
self.client = Client("maxiw/OS-ATLAS") | |
def get_bounding_box(self, query: str, image_file: str | Path): | |
result = self.client.predict( | |
image=handle_file(image_file), | |
text_input=f"{query}\nReturn the response in the form of a bbox", | |
model_id=OS_ATLAS_BASE_7B_MODEL_ID, | |
api_name="/run_example", | |
) | |
return extract_bbox(result[1]) | |
def get_coordinates(self, query: str, image_file: str | Path): | |
bbox = self.get_bounding_box(query, image_file) | |
if bbox is None: | |
return None | |
return extract_coordinates(bbox) | |
# JavaScript source defining `buildSelector(el)`, a recursive helper that
# returns a CSS selector path for a DOM element. The path stops at the first
# ancestor that has an id, or at `body` / `html`. The snippet is prepended to
# the page-evaluated functions below, so it must remain a plain declaration.
BUILD_SELECTOR_FUNCTION = """
const buildSelector = (el) => {
    if (!el || el === document.body) return 'body';
    // CSS.escape keeps ids and class names containing special characters
    // (leading digits, ':', '/', ...) from producing an invalid selector.
    if (el.id) return `#${CSS.escape(el.id)}`;
    // The root element has no parent; without this guard it would be
    // reported as the invalid path `body > html`.
    if (el === document.documentElement) return 'html';
    let selector = el.tagName.toLowerCase();
    if (el.classList.length) {
        selector += `.${Array.from(el.classList).map((c) => CSS.escape(c)).join('.')}`;
    }
    const siblings = el.parentNode ? Array.from(el.parentNode.children) : [];
    if (siblings.length > 1) {
        const index = siblings.indexOf(el) + 1;
        selector += `:nth-child(${index})`;
    }
    const parentSelector = buildSelector(el.parentElement);
    return `${parentSelector} > ${selector}`;
};
"""
# JavaScript passed to `page.evaluate` with an `[x, y]` argument.
# It looks up the element at that point with `document.elementFromPoint` and
# returns its `buildSelector` path, or null when no element is found there.
# The arrow function must stay the last statement of the concatenated source.
SELECTOR_FROM_COORDS_FUNCTION = BUILD_SELECTOR_FUNCTION + """
([x, y]) => {
    const element = document.elementFromPoint(x, y);
    return !element ? null : buildSelector(element);
}
"""
# JavaScript passed to `page.evaluate` with `[x1, y1, x2, y2, nParents]`
# (`nParents` defaults to 0 when omitted).
# It samples the box on a 5-unit grid, gathers every element reported by
# `document.elementsFromPoint` at those points, keeps the one with the
# smallest non-zero bounding-rect area, then walks up at most `nParents`
# ancestors. The result is `[outerHTML, selector]` for that element, or
# `[null, null]` when no element with a positive area was found.
DOM_CHUNK_FROM_BBOX_FUNCTION = BUILD_SELECTOR_FUNCTION + """
([x1, y1, x2, y2, nParents = 0]) => {
    const elements = new Set();
    const step = 5; // To cover more points within the box
    for (let x = x1; x <= x2; x += step) {
        for (let y = y1; y <= y2; y += step) {
            const foundElements = document.elementsFromPoint(x, y);
            foundElements.forEach(el => elements.add(el));
        }
    }
    // Filter to the smallest element (least area)
    let smallestElement = null;
    let minArea = Infinity;
    elements.forEach(el => {
        const rect = el.getBoundingClientRect();
        const area = rect.width * rect.height;
        if (area > 0 && area < minArea) {
            minArea = area;
            smallestElement = el;
        }
    });
    if (!smallestElement) return [null, null];
    // Move up to nParents levels or until the top parent
    let currentElement = smallestElement;
    for (let i = 0; i < nParents && currentElement.parentElement; i++) {
        currentElement = currentElement.parentElement;
    }
    return [currentElement.outerHTML, buildSelector(currentElement)];
}
"""
def draw_bounding_box(image_path, bbox, output_path=None, box_color=(255, 0, 0), box_width=3):
    """Draw a bounding box on an image and save or display the result.

    Args:
        image_path (str): Path to the input image file.
        bbox (tuple): Bounding box coordinates in the format (x_min, y_min, x_max, y_max).
        output_path (str, optional): Path to save the output image. If None, displays the image.
        box_color (tuple, optional): RGB color of the bounding box. Default is red.
        box_width (int, optional): Width of the bounding box lines. Default is 3.
    """
    # The context manager releases the image file even if drawing, saving
    # or showing raises.
    with Image.open(image_path) as image:
        draw = ImageDraw.Draw(image)
        draw.rectangle(bbox, outline=box_color, width=box_width)
        if output_path:
            image.save(output_path)
        else:
            image.show()
# grounding_model = OSAtlasBase7BAPI() | |
def get_selector_and_dom(url: str, query: str):
    """Open *url* in headless Chromium and resolve a bounding box to DOM data.

    The page is rendered in a 1920x1080 viewport and captured to
    ``screenshot.png`` in the current working directory. The bounding box is
    currently hard-coded, so *query* is accepted but not used yet.

    Args:
        url: Address of the page to load.
        query: Description of the element to locate (reserved for the
            grounding model call, which is disabled for now).

    Returns:
        ``(selector_path, dom_chunk)``. ``selector_path`` is the selector of
        the element at the box centre; ``dom_chunk`` is the result of the
        bounding-box lookup evaluated with one parent level. Both are
        ``None`` when no bounding box is available.
    """
    # Defined up front so the return statement is valid even without a box.
    selector_path = None
    dom_chunk = None
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page(viewport={"width": 1920, "height": 1080})
            page.goto(url)
            screenshot_path = Path("screenshot.png")
            page.screenshot(path=str(screenshot_path))
            # TODO: replace the fixed box with
            # grounding_model.get_bounding_box(query, screenshot_path)
            bbox = (416.0, 413.0, 618.0, 438.0)
            if bbox:
                # Preview of the raw box (red), then of the rounded box (green).
                draw_bounding_box(screenshot_path, bbox)
                x1, y1, x2, y2 = map(round, bbox)
                x, y = (x1 + x2) // 2, (y1 + y2) // 2
                draw_bounding_box(screenshot_path, (x1, y1, x2, y2), box_color=(0, 255, 0), box_width=1)
                print("Bounding box:", (x1, y1, x2, y2))
                print("Coordinates:", (x, y))
                selector_path = page.evaluate(SELECTOR_FROM_COORDS_FUNCTION, [x, y])
                dom_chunk = page.evaluate(DOM_CHUNK_FROM_BBOX_FUNCTION, [x1, y1, x2, y2, 1])
        finally:
            # Close the browser even when navigation or evaluation fails.
            browser.close()
    return selector_path, dom_chunk
# Example run: resolve the target section of the course page and print the
# selector and DOM chunk that were found.
url = "https://llmagents-learning.org/sp25"
query = "Prospective Students section"
selector, dom_chunk = get_selector_and_dom(url=url, query=query)
print("Selector:", selector)
print("DOM Chunk:", dom_chunk)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment