# synacktraa/locatr.py

## locatr.py
from playwright.sync_api import sync_playwright
from pathlib import Path

import re
from pathlib import Path

from gradio_client import Client, handle_file
from PIL import Image, ImageDraw


# Model identifier sent as `model_id` with each "/run_example" prediction request.
OS_ATLAS_BASE_7B_MODEL_ID = "OS-Copilot/OS-Atlas-Base-7B"

def extract_bbox(response: str):
    numbers = re.findall(r'-?\d+(?:\.\d+)?', response)
    if not numbers:
        return None
    x1, y1, x2, y2 = map(float, numbers[:4])
    return x1, y1, x2, y2


def extract_coordinates(bbox: tuple[float, float, float, float]):
    """
    Computes the centre point of a bounding box.

    Args:
        bbox: Bounding box as (x1, y1, x2, y2).

    Returns:
        The centre as a tuple of ints (x, y); each midpoint is truncated toward zero.
    """
    left, top, right, bottom = bbox
    center_x = int((left + right) / 2)
    center_y = int((top + bottom) / 2)
    return center_x, center_y


class OSAtlasBase7BAPI:

    def __init__(self):
        import os
        self.client = Client("maxiw/OS-ATLAS")

    def get_bounding_box(self, query: str, image_file: str | Path):
        result = self.client.predict(
            image=handle_file(image_file),
            text_input=f"{query}\nReturn the response in the form of a bbox",
            model_id=OS_ATLAS_BASE_7B_MODEL_ID,
            api_name="/run_example",
        )
        return extract_bbox(result[1])

    def get_coordinates(self, query: str, image_file: str | Path):
        bbox = self.get_bounding_box(query, image_file)
        if bbox is None:
            return None
        return extract_coordinates(bbox)


# JavaScript source defining `buildSelector(el)`, which builds a CSS selector path for an
# element: `#id` when the element has an id, otherwise `tag.classes:nth-child(n)` chained
# to its parent's selector with ` > `, terminating at `body` (or `html`).
# Ids and class names go through CSS.escape so that names containing characters such as
# ':' or '/' still produce a valid selector. The trailing `;` keeps the declaration
# self-terminated when other snippets are concatenated after it.
BUILD_SELECTOR_FUNCTION = """
const buildSelector = (el) => {
    if (!el || el === document.body) return 'body';
    if (el === document.documentElement) return 'html';
    if (el.id) return `#${CSS.escape(el.id)}`;
    let selector = el.tagName.toLowerCase();
    if (el.classList.length) {
        selector += `.${Array.from(el.classList, (name) => CSS.escape(name)).join('.')}`;
    }
    const siblings = el.parentNode ? Array.from(el.parentNode.children) : [];
    if (siblings.length > 1) {
        const index = siblings.indexOf(el) + 1;
        selector += `:nth-child(${index})`;
    }
    const parentSelector = buildSelector(el.parentElement);
    return `${parentSelector} > ${selector}`;
};

"""

# JavaScript source: the buildSelector helper followed by an arrow function that takes
# `[x, y]`, looks up `document.elementFromPoint(x, y)` and returns that element's
# selector, or null when no element is found at the point.
SELECTOR_FROM_COORDS_FUNCTION = BUILD_SELECTOR_FUNCTION + """
([x, y]) => {
    const element = document.elementFromPoint(x, y);
    return !element ? null : buildSelector(element);
}
"""

# JavaScript source: the buildSelector helper followed by an arrow function that takes
# `[x1, y1, x2, y2, nParents = 0]`. It samples the box on a grid with a step of 5 using
# `document.elementsFromPoint`, keeps the sampled element with the smallest non-zero
# bounding-rect area, walks up at most `nParents` parent elements, and returns
# `[outerHTML, selector]` for the resulting element, or `[null, null]` when no element
# with a positive area was found.
DOM_CHUNK_FROM_BBOX_FUNCTION = BUILD_SELECTOR_FUNCTION + """
([x1, y1, x2, y2, nParents = 0]) => {
    const elements = new Set();
    const step = 5;  // To cover more points within the box

    for (let x = x1; x <= x2; x += step) {
        for (let y = y1; y <= y2; y += step) {
            const foundElements = document.elementsFromPoint(x, y);
            foundElements.forEach(el => elements.add(el));
        }
    }

    // Filter to the smallest element (least area)
    let smallestElement = null;
    let minArea = Infinity;

    elements.forEach(el => {
        const rect = el.getBoundingClientRect();
        const area = rect.width * rect.height;
        if (area > 0 && area < minArea) {
            minArea = area;
            smallestElement = el;
        }
    });

    if (!smallestElement) return [null, null];

    // Move up to nParents levels or until the top parent
    let currentElement = smallestElement;
    for (let i = 0; i < nParents && currentElement.parentElement; i++) {
        currentElement = currentElement.parentElement;
    }

    return [currentElement.outerHTML, buildSelector(currentElement)];
}
"""


def draw_bounding_box(image_path, bbox, output_path=None, box_color=(255, 0, 0), box_width=3):
    """
    Draws a bounding box on an image.

    Args:
        image_path (str or Path): Path to the input image file.
        bbox (tuple): Bounding box coordinates in the format (x_min, y_min, x_max, y_max).
        output_path (str or Path, optional): Path to save the output image. If None, displays the image.
        box_color (tuple, optional): RGB color of the bounding box. Default is red.
        box_width (int, optional): Width of the bounding box lines. Default is 3.
    """
    # The context manager guarantees the image's file handle is released on every path,
    # including when drawing, saving or showing raises.
    with Image.open(image_path) as image:
        draw = ImageDraw.Draw(image)
        draw.rectangle(bbox, outline=box_color, width=box_width)

        if output_path:
            image.save(output_path)
        else:
            image.show()

# grounding_model = OSAtlasBase7BAPI()

def get_selector_and_dom(url: str, query: str):
    """
    Loads a page in headless Chromium, screenshots it, and maps a bounding box to the DOM.

    Args:
        url: Address of the page to open.
        query: Description of the element to locate. Currently unused: the grounding
            model call is commented out and a fixed bounding box is used instead.

    Returns:
        A tuple (selector_path, dom_chunk). `selector_path` is the result of
        SELECTOR_FROM_COORDS_FUNCTION for the box centre; `dom_chunk` is the
        `[outerHTML, selector]` pair returned by DOM_CHUNK_FROM_BBOX_FUNCTION with one
        parent level. Returns (None, None) when no bounding box is available, so callers
        can always unpack the result.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page(viewport={"width": 1920, "height": 1080})
            page.goto(url)

            screenshot_path = Path("screenshot.png")
            page.screenshot(path=str(screenshot_path))

            # TODO: replace the fixed box with the grounding model call below.
            # bbox = grounding_model.get_bounding_box(query, screenshot_path)
            bbox = (416.0, 413.0, 618.0, 438.0)
            # Check before drawing: draw_bounding_box cannot handle a missing box.
            if not bbox:
                return None, None

            draw_bounding_box(screenshot_path, bbox)
            x1, y1, x2, y2 = map(round, bbox)
            x, y = (x1 + x2) // 2, (y1 + y2) // 2

            # x1 = x1 - 0.1 * 1920
            # x2 = x2 + 0.1 * 1920
            # y1 = y1 - 0.1 * 1080
            # y2 = y2 + 0.1 * 1080
            draw_bounding_box(screenshot_path, (x1, y1, x2, y2), box_color=(0, 255, 0), box_width=1)
            print("Bounding box:", (x1, y1, x2, y2))
            print("Coordinates:", (x, y))

            selector_path = page.evaluate(SELECTOR_FROM_COORDS_FUNCTION, [x, y])
            dom_chunk = page.evaluate(DOM_CHUNK_FROM_BBOX_FUNCTION, [x1, y1, x2, y2, 1])

            return selector_path, dom_chunk
        finally:
            # Close the browser on every exit path, including exceptions.
            browser.close()


# Example run: resolve a selector and DOM chunk for a fixed page and query, then print them.
# NOTE(review): these statements execute at module level, so they also run when the
# module is imported — consider an `if __name__ == "__main__":` guard.
url = "https://llmagents-learning.org/sp25"
query = "Prospective Students section"
selector, dom_chunk = get_selector_and_dom(url, query)

print("Selector:", selector)
print("DOM Chunk:", dom_chunk)
	from playwright.sync_api import sync_playwright
	from pathlib import Path

	import re
	from pathlib import Path

	from gradio_client import Client, handle_file
	from PIL import Image, ImageDraw


	# Model identifier sent as `model_id` with each "/run_example" prediction request.
	OS_ATLAS_BASE_7B_MODEL_ID = "OS-Copilot/OS-Atlas-Base-7B"

	def extract_bbox(response: str):
	numbers = re.findall(r'-?\d+(?:\.\d+)?', response)
	if not numbers:
	return None
	x1, y1, x2, y2 = map(float, numbers[:4])
	return x1, y1, x2, y2


	def extract_coordinates(bbox: tuple[float, float, float, float]):
	    """
	    Computes the centre point of a bounding box.

	    Args:
	        bbox: Bounding box as (x1, y1, x2, y2).

	    Returns:
	        The centre as a tuple of ints (x, y); each midpoint is truncated toward zero.
	    """
	    x1, y1, x2, y2 = bbox
	    return int((x1 + x2) / 2), int((y1 + y2) / 2)


	class OSAtlasBase7BAPI:

	def __init__(self):
	import os
	self.client = Client("maxiw/OS-ATLAS")

	def get_bounding_box(self, query: str, image_file: str \| Path):
	result = self.client.predict(
	image=handle_file(image_file),
	text_input=f"{query}\nReturn the response in the form of a bbox",
	model_id=OS_ATLAS_BASE_7B_MODEL_ID,
	api_name="/run_example",
	)
	return extract_bbox(result[1])

	def get_coordinates(self, query: str, image_file: str \| Path):
	bbox = self.get_bounding_box(query, image_file)
	if bbox is None:
	return None
	return extract_coordinates(bbox)


	# JavaScript source defining `buildSelector(el)`, which builds a CSS selector path for an
	# element: `#id` when the element has an id, otherwise `tag.classes:nth-child(n)` chained
	# to its parent's selector with ` > `, terminating at `body` (or `html`).
	# Ids and class names go through CSS.escape so that names containing characters such as
	# ':' or '/' still produce a valid selector. The logical OR is written as a plain `||`;
	# a backslash-escaped form is not valid JavaScript.
	BUILD_SELECTOR_FUNCTION = """
	const buildSelector = (el) => {
	    if (!el || el === document.body) return 'body';
	    if (el === document.documentElement) return 'html';
	    if (el.id) return `#${CSS.escape(el.id)}`;
	    let selector = el.tagName.toLowerCase();
	    if (el.classList.length) {
	        selector += `.${Array.from(el.classList, (name) => CSS.escape(name)).join('.')}`;
	    }
	    const siblings = el.parentNode ? Array.from(el.parentNode.children) : [];
	    if (siblings.length > 1) {
	        const index = siblings.indexOf(el) + 1;
	        selector += `:nth-child(${index})`;
	    }
	    const parentSelector = buildSelector(el.parentElement);
	    return `${parentSelector} > ${selector}`;
	};

	"""

	# JavaScript source: the buildSelector helper followed by an arrow function that takes
	# `[x, y]`, looks up `document.elementFromPoint(x, y)` and returns that element's
	# selector, or null when no element is found at the point.
	SELECTOR_FROM_COORDS_FUNCTION = BUILD_SELECTOR_FUNCTION + """
	([x, y]) => {
	const element = document.elementFromPoint(x, y);
	return !element ? null : buildSelector(element);
	}
	"""

	# JavaScript source: the buildSelector helper followed by an arrow function that takes
	# `[x1, y1, x2, y2, nParents = 0]`. It samples the box on a grid with a step of 5 using
	# `document.elementsFromPoint`, keeps the sampled element with the smallest non-zero
	# bounding-rect area, walks up at most `nParents` parent elements, and returns
	# `[outerHTML, selector]` for the resulting element, or `[null, null]` when no element
	# with a positive area was found.
	DOM_CHUNK_FROM_BBOX_FUNCTION = BUILD_SELECTOR_FUNCTION + """
	([x1, y1, x2, y2, nParents = 0]) => {
	const elements = new Set();
	const step = 5; // To cover more points within the box

	for (let x = x1; x <= x2; x += step) {
	for (let y = y1; y <= y2; y += step) {
	const foundElements = document.elementsFromPoint(x, y);
	foundElements.forEach(el => elements.add(el));
	}
	}

	// Filter to the smallest element (least area)
	let smallestElement = null;
	let minArea = Infinity;

	elements.forEach(el => {
	const rect = el.getBoundingClientRect();
	const area = rect.width * rect.height;
	if (area > 0 && area < minArea) {
	minArea = area;
	smallestElement = el;
	}
	});

	if (!smallestElement) return [null, null];

	// Move up to nParents levels or until the top parent
	let currentElement = smallestElement;
	for (let i = 0; i < nParents && currentElement.parentElement; i++) {
	currentElement = currentElement.parentElement;
	}

	return [currentElement.outerHTML, buildSelector(currentElement)];
	}
	"""


	def draw_bounding_box(image_path, bbox, output_path=None, box_color=(255, 0, 0), box_width=3):
	    """
	    Draws a bounding box on an image.

	    Args:
	        image_path (str or Path): Path to the input image file.
	        bbox (tuple): Bounding box coordinates in the format (x_min, y_min, x_max, y_max).
	        output_path (str or Path, optional): Path to save the output image. If None, displays the image.
	        box_color (tuple, optional): RGB color of the bounding box. Default is red.
	        box_width (int, optional): Width of the bounding box lines. Default is 3.
	    """
	    # The context manager guarantees the image's file handle is released on every path,
	    # including when drawing, saving or showing raises.
	    with Image.open(image_path) as image:
	        draw = ImageDraw.Draw(image)
	        draw.rectangle(bbox, outline=box_color, width=box_width)

	        if output_path:
	            image.save(output_path)
	        else:
	            image.show()

	# grounding_model = OSAtlasBase7BAPI()

	def get_selector_and_dom(url: str, query: str):
	    """
	    Loads a page in headless Chromium, screenshots it, and maps a bounding box to the DOM.

	    Args:
	        url: Address of the page to open.
	        query: Description of the element to locate. Currently unused: the grounding
	            model call is commented out and a fixed bounding box is used instead.

	    Returns:
	        A tuple (selector_path, dom_chunk). `selector_path` is the result of
	        SELECTOR_FROM_COORDS_FUNCTION for the box centre; `dom_chunk` is the
	        `[outerHTML, selector]` pair returned by DOM_CHUNK_FROM_BBOX_FUNCTION with one
	        parent level. Returns (None, None) when no bounding box is available, so callers
	        can always unpack the result.
	    """
	    with sync_playwright() as p:
	        browser = p.chromium.launch(headless=True)
	        try:
	            page = browser.new_page(viewport={"width": 1920, "height": 1080})
	            page.goto(url)

	            screenshot_path = Path("screenshot.png")
	            page.screenshot(path=str(screenshot_path))

	            # TODO: replace the fixed box with the grounding model call below.
	            # bbox = grounding_model.get_bounding_box(query, screenshot_path)
	            bbox = (416.0, 413.0, 618.0, 438.0)
	            # Check before drawing: draw_bounding_box cannot handle a missing box.
	            if not bbox:
	                return None, None

	            draw_bounding_box(screenshot_path, bbox)
	            x1, y1, x2, y2 = map(round, bbox)
	            x, y = (x1 + x2) // 2, (y1 + y2) // 2

	            # x1 = x1 - 0.1 * 1920
	            # x2 = x2 + 0.1 * 1920
	            # y1 = y1 - 0.1 * 1080
	            # y2 = y2 + 0.1 * 1080
	            draw_bounding_box(screenshot_path, (x1, y1, x2, y2), box_color=(0, 255, 0), box_width=1)
	            print("Bounding box:", (x1, y1, x2, y2))
	            print("Coordinates:", (x, y))

	            selector_path = page.evaluate(SELECTOR_FROM_COORDS_FUNCTION, [x, y])
	            dom_chunk = page.evaluate(DOM_CHUNK_FROM_BBOX_FUNCTION, [x1, y1, x2, y2, 1])

	            return selector_path, dom_chunk
	        finally:
	            # Close the browser on every exit path, including exceptions.
	            browser.close()


	# Example run: resolve a selector and DOM chunk for a fixed page and query, then print them.
	# NOTE(review): these statements execute at module level, so they also run when the
	# module is imported — consider an `if __name__ == "__main__":` guard.
	url = "https://llmagents-learning.org/sp25"
	query = "Prospective Students section"
	selector, dom_chunk = get_selector_and_dom(url, query)

	print("Selector:", selector)
	print("DOM Chunk:", dom_chunk)