from bs4 import BeautifulSoup
import re
import csv


def extract_urls_from_html(html_path):
    """Extract all URLs and their link text from an HTML file,
    including <a> tag links and plain text URLs."""
    with open(html_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    url_data = set()  # Use a set to avoid duplicates

    # Extract URLs from <a href="..."> and their link text
    for link in soup.find_all("a", href=True):
        url = link["href"].strip()
        link_text = link.get_text(strip=True)  # Extract visible link text
        url_data.add((url, link_text))

    # Extract URLs appearing as plain text
    url_pattern = re.compile(r"https?://[^\s\"'>]+")
    for text in soup.stripped_strings:
        for match in url_pattern.findall(text):
            url_data.add((match, ""))  # No link text for plain text URLs

    return list(url_data)


def save_urls_to_csv(url_data, output_file):
    """Save extracted URLs and their link text to a CSV file."""
    with open(output_file, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["URL", "Link Text"])  # Header row
        writer.writerows(url_data)
    print(f"URLs saved to {output_file}")


if __name__ == "__main__":
    html_path = "leads.html"  # Change this to your HTML file path
    extracted_urls = extract_urls_from_html(html_path)

    if extracted_urls:
        output_file = "lead_urls.csv"
        print("Extracted URLs:")
        for url, text in extracted_urls:
            print(f"URL: {url}, Link Text: {text}")
        save_urls_to_csv(extracted_urls, output_file)
    else:
        print("No URLs found in the HTML file.")
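

# --- Usage sketch (illustration only, not part of the original script) ---
# A minimal, hedged example of exercising extract_urls_from_html() without a
# real leads.html on disk: it writes a throwaway HTML file, extracts from it,
# and cleans up. `_demo` and `sample_html` are hypothetical names introduced
# here for illustration; requires beautifulsoup4 (pip install beautifulsoup4).
def _demo():
    import os
    import tempfile

    sample_html = (
        '<p><a href="https://example.com">Example</a> '
        "and a bare link: https://example.org/page</p>"
    )
    # delete=False so the file can be reopened by path (needed on Windows)
    with tempfile.NamedTemporaryFile(
        "w", suffix=".html", delete=False, encoding="utf-8"
    ) as tmp:
        tmp.write(sample_html)
        tmp_path = tmp.name
    try:
        # Expected: ("https://example.com", "Example") from the <a> tag and
        # ("https://example.org/page", "") from the plain-text regex pass.
        for url, text in extract_urls_from_html(tmp_path):
            print(f"{url} -> {text!r}")
    finally:
        os.unlink(tmp_path)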