from bs4 import BeautifulSoup
import re
import csv


def extract_urls_from_html(html_path):
    """Extract all URLs and their link text from an HTML file,
    including <a> tag links and plain text URLs."""
    with open(html_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    url_data = set()  # Use a set to avoid duplicates

    # Extract URLs from <a href="..."> and their link text
    for link in soup.find_all("a", href=True):
        url = link["href"].strip()
        link_text = link.get_text(strip=True)  # Extract visible link text
        url_data.add((url, link_text))

    # Extract URLs appearing as plain text
    url_pattern = re.compile(r"https?://[^\s\"'>]+")
    for text in soup.stripped_strings:
        for match in url_pattern.findall(text):
            url_data.add((match, ""))  # No link text for plain text URLs

    return list(url_data)


def save_urls_to_csv(url_data, output_file):
    """Save extracted URLs and their link text to a CSV file."""
    with open(output_file, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["URL", "Link Text"])  # Header row
        writer.writerows(url_data)
    print(f"URLs saved to {output_file}")


if __name__ == "__main__":
    html_path = "leads.html"  # Change this to your HTML file path
    extracted_urls = extract_urls_from_html(html_path)

    if extracted_urls:
        output_file = "lead_urls.csv"
        print("Extracted URLs:")
        for url, text in extracted_urls:
            print(f"URL: {url}, Link Text: {text}")
        save_urls_to_csv(extracted_urls, output_file)
    else:
        print("No URLs found in the HTML file.")
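

# --- Usage sketch (illustration only, not part of the original script) ---
# A minimal, hedged example of exercising extract_urls_from_html() without a
# real leads.html on disk: it writes a throwaway HTML file, extracts from it,
# and cleans up. `_demo` and `sample_html` are hypothetical names introduced
# here for illustration; requires beautifulsoup4 (pip install beautifulsoup4).
def _demo():
    import os
    import tempfile

    sample_html = (
        '<p><a href="https://example.com">Example</a> '
        "and a bare link: https://example.org/page</p>"
    )
    # delete=False so the file can be reopened by path (needed on Windows)
    with tempfile.NamedTemporaryFile(
        "w", suffix=".html", delete=False, encoding="utf-8"
    ) as tmp:
        tmp.write(sample_html)
        tmp_path = tmp.name
    try:
        # Expected: ("https://example.com", "Example") from the <a> tag and
        # ("https://example.org/page", "") from the plain-text regex pass.
        for url, text in extract_urls_from_html(tmp_path):
            print(f"{url} -> {text!r}")
    finally:
        os.unlink(tmp_path)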